{"step": 1, "loss": 10.519957542419434, "lr": 4.0000000000000003e-07, "elapsed_sec": 9.796109676361084, "step_time_sec": 9.732346511998912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 12185815552}
{"step": 2, "loss": 10.505473136901855, "lr": 8.000000000000001e-07, "elapsed_sec": 18.018307209014893, "step_time_sec": 8.22200025501661, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3, "loss": 10.501450538635254, "lr": 1.2000000000000002e-06, "elapsed_sec": 26.230910062789917, "step_time_sec": 8.212467817997094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4, "loss": 10.501275062561035, "lr": 1.6000000000000001e-06, "elapsed_sec": 34.44721722602844, "step_time_sec": 8.216161029995419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5, "loss": 10.478228569030762, "lr": 2e-06, "elapsed_sec": 42.66265606880188, "step_time_sec": 8.215188117988873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 6, "loss": 10.466719627380371, "lr": 2.4000000000000003e-06, "elapsed_sec": 50.890071392059326, "step_time_sec": 8.227324009989388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 7, "loss": 10.447820663452148, "lr": 2.8e-06, "elapsed_sec": 59.118703842163086, "step_time_sec": 8.22842887500883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 8, "loss": 10.412598609924316, "lr": 3.2000000000000003e-06, "elapsed_sec": 67.34878778457642, "step_time_sec": 8.229949002998183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 9, "loss": 10.372179985046387, "lr": 3.6000000000000003e-06, "elapsed_sec": 75.57836127281189, "step_time_sec": 8.22940476500662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 10, "loss": 10.363941192626953, "lr": 4e-06, "elapsed_sec": 83.80844378471375, "step_time_sec": 8.229922703991178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 11, "loss": 10.329357147216797, "lr": 4.4e-06, "elapsed_sec": 92.03840279579163, "step_time_sec": 8.229799997003283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 12, "loss": 10.311334609985352, "lr": 4.800000000000001e-06, "elapsed_sec": 100.26704454421997, "step_time_sec": 8.228462800005218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 13, "loss": 10.261175155639648, "lr": 5.200000000000001e-06, "elapsed_sec": 108.49778366088867, "step_time_sec": 8.230555889022071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 14, "loss": 10.238824844360352, "lr": 5.6e-06, "elapsed_sec": 116.72524929046631, "step_time_sec": 8.227268234011717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 15, "loss": 10.205403327941895, "lr": 6e-06, "elapsed_sec": 124.952707529068, "step_time_sec": 8.227358036994701, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 16, "loss": 10.184412956237793, "lr": 6.4000000000000006e-06, "elapsed_sec": 133.17745518684387, "step_time_sec": 8.224523682991276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 17, "loss": 10.12645435333252, "lr": 6.8e-06, "elapsed_sec": 141.40650868415833, "step_time_sec": 8.228868668986252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 18, "loss": 10.159541130065918, "lr": 7.2000000000000005e-06, "elapsed_sec": 149.63483142852783, "step_time_sec": 8.228160967002623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 19, "loss": 10.130760192871094, "lr": 7.6e-06, "elapsed_sec": 157.8638038635254, "step_time_sec": 8.228822243021568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 20, "loss": 10.064231872558594, "lr": 8e-06, "elapsed_sec": 166.09310102462769, "step_time_sec": 8.229200270987349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 21, "loss": 10.004128456115723, "lr": 8.400000000000001e-06, "elapsed_sec": 174.31966471672058, "step_time_sec": 8.226328710006783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 22, "loss": 9.99524974822998, "lr": 8.8e-06, "elapsed_sec": 182.54726243019104, "step_time_sec": 8.227431836014148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 23, "loss": 9.948460578918457, "lr": 9.2e-06, "elapsed_sec": 190.77384305000305, "step_time_sec": 8.226413181982934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 24, "loss": 9.9866361618042, "lr": 9.600000000000001e-06, "elapsed_sec": 199.00333094596863, "step_time_sec": 8.229333811003016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 25, "loss": 9.896597862243652, "lr": 1e-05, "elapsed_sec": 207.2324240207672, "step_time_sec": 8.22895231100847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 26, "loss": 9.875495910644531, "lr": 1.0400000000000002e-05, "elapsed_sec": 215.46097540855408, "step_time_sec": 8.228372920013499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 27, "loss": 9.86939811706543, "lr": 1.08e-05, "elapsed_sec": 223.69012928009033, "step_time_sec": 8.229045760992449, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 28, "loss": 9.840307235717773, "lr": 1.12e-05, "elapsed_sec": 231.92017364501953, "step_time_sec": 8.22984655899927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 29, "loss": 9.757282257080078, "lr": 1.16e-05, "elapsed_sec": 240.14669013023376, "step_time_sec": 8.226365944981808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 30, "loss": 9.766834259033203, "lr": 1.2e-05, "elapsed_sec": 248.37262511253357, "step_time_sec": 8.225776871986454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 31, "loss": 9.765989303588867, "lr": 1.2400000000000002e-05, "elapsed_sec": 256.6003506183624, "step_time_sec": 8.22759578199475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 32, "loss": 9.742070198059082, "lr": 1.2800000000000001e-05, "elapsed_sec": 264.82851481437683, "step_time_sec": 8.227965506986948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 33, "loss": 9.755448341369629, "lr": 1.32e-05, "elapsed_sec": 273.0548417568207, "step_time_sec": 8.22615329999826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 34, "loss": 9.692987442016602, "lr": 1.36e-05, "elapsed_sec": 281.2834141254425, "step_time_sec": 8.228463484003441, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 35, "loss": 9.610698699951172, "lr": 1.4e-05, "elapsed_sec": 289.50989413261414, "step_time_sec": 8.226306571013993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 36, "loss": 9.66393756866455, "lr": 1.4400000000000001e-05, "elapsed_sec": 297.73748874664307, "step_time_sec": 8.227390157990158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 37, "loss": 9.577474594116211, "lr": 1.48e-05, "elapsed_sec": 305.96486735343933, "step_time_sec": 8.227225451002596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 38, "loss": 9.56798267364502, "lr": 1.52e-05, "elapsed_sec": 314.1919975280762, "step_time_sec": 8.226970474992413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 39, "loss": 9.569748878479004, "lr": 1.56e-05, "elapsed_sec": 322.4191131591797, "step_time_sec": 8.226949507981772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 40, "loss": 9.453240394592285, "lr": 1.6e-05, "elapsed_sec": 330.64545154571533, "step_time_sec": 8.226195869006915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 41, "loss": 9.475468635559082, "lr": 1.6400000000000002e-05, "elapsed_sec": 338.87241291999817, "step_time_sec": 8.226823637989582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 42, "loss": 9.525599479675293, "lr": 1.6800000000000002e-05, "elapsed_sec": 347.09905982017517, "step_time_sec": 8.226462569000432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 43, "loss": 9.52625846862793, "lr": 1.72e-05, "elapsed_sec": 355.3265504837036, "step_time_sec": 8.227369844011264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 44, "loss": 9.486774444580078, "lr": 1.76e-05, "elapsed_sec": 363.55301451683044, "step_time_sec": 8.226260908995755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 45, "loss": 9.398921012878418, "lr": 1.8e-05, "elapsed_sec": 371.78215980529785, "step_time_sec": 8.228979434003122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 46, "loss": 9.456478118896484, "lr": 1.84e-05, "elapsed_sec": 380.0110936164856, "step_time_sec": 8.228762501006713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 47, "loss": 9.471497535705566, "lr": 1.88e-05, "elapsed_sec": 388.2387022972107, "step_time_sec": 8.227461221977137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 48, "loss": 9.42961311340332, "lr": 1.9200000000000003e-05, "elapsed_sec": 396.46638798713684, "step_time_sec": 8.22757439999259, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 49, "loss": 9.374841690063477, "lr": 1.96e-05, "elapsed_sec": 404.6935887336731, "step_time_sec": 8.227001930994447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 50, "loss": 9.377694129943848, "lr": 2e-05, "elapsed_sec": 412.9214472770691, "step_time_sec": 8.227713411004515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 51, "loss": 9.425689697265625, "lr": 2.04e-05, "elapsed_sec": 421.1496915817261, "step_time_sec": 8.228076894010883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 52, "loss": 9.344730377197266, "lr": 2.0800000000000004e-05, "elapsed_sec": 429.3789691925049, "step_time_sec": 8.22917083700304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 53, "loss": 9.244389533996582, "lr": 2.12e-05, "elapsed_sec": 437.60842394828796, "step_time_sec": 8.229241067980183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 54, "loss": 9.266797065734863, "lr": 2.16e-05, "elapsed_sec": 445.83601546287537, "step_time_sec": 8.227470715995878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 55, "loss": 9.28549575805664, "lr": 2.2000000000000003e-05, "elapsed_sec": 454.0632162094116, "step_time_sec": 8.227023182000266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 56, "loss": 9.293370246887207, "lr": 2.24e-05, "elapsed_sec": 462.29025769233704, "step_time_sec": 8.226858523004921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 57, "loss": 9.281633377075195, "lr": 2.2800000000000002e-05, "elapsed_sec": 470.51883697509766, "step_time_sec": 8.228487167012645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 58, "loss": 9.254070281982422, "lr": 2.32e-05, "elapsed_sec": 478.7454881668091, "step_time_sec": 8.22645947500132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 59, "loss": 9.282824516296387, "lr": 2.36e-05, "elapsed_sec": 486.974591255188, "step_time_sec": 8.228909316996578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 60, "loss": 9.197436332702637, "lr": 2.4e-05, "elapsed_sec": 495.20382952690125, "step_time_sec": 8.229156454006443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 61, "loss": 9.225679397583008, "lr": 2.44e-05, "elapsed_sec": 503.432941198349, "step_time_sec": 8.228889717021957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 62, "loss": 9.116599082946777, "lr": 2.4800000000000003e-05, "elapsed_sec": 511.66107964515686, "step_time_sec": 8.228026538010454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 63, "loss": 9.136240005493164, "lr": 2.52e-05, "elapsed_sec": 519.8900518417358, "step_time_sec": 8.228735620999942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 64, "loss": 9.149857521057129, "lr": 2.5600000000000002e-05, "elapsed_sec": 528.1182181835175, "step_time_sec": 8.228078635991551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 65, "loss": 9.105879783630371, "lr": 2.6000000000000002e-05, "elapsed_sec": 536.3457217216492, "step_time_sec": 8.227261777006788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 66, "loss": 9.14565658569336, "lr": 2.64e-05, "elapsed_sec": 544.5748105049133, "step_time_sec": 8.228933826001594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 67, "loss": 9.06406307220459, "lr": 2.68e-05, "elapsed_sec": 552.8039801120758, "step_time_sec": 8.228996248013573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 68, "loss": 9.069005012512207, "lr": 2.72e-05, "elapsed_sec": 561.0331540107727, "step_time_sec": 8.229085319995647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 69, "loss": 9.042241096496582, "lr": 2.7600000000000003e-05, "elapsed_sec": 569.2594027519226, "step_time_sec": 8.226074697013246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 70, "loss": 9.050315856933594, "lr": 2.8e-05, "elapsed_sec": 577.4864649772644, "step_time_sec": 8.226919159991667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 71, "loss": 8.961495399475098, "lr": 2.8400000000000003e-05, "elapsed_sec": 585.7124617099762, "step_time_sec": 8.22581908799475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 72, "loss": 8.977455139160156, "lr": 2.8800000000000002e-05, "elapsed_sec": 593.9349598884583, "step_time_sec": 8.22235342199565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 73, "loss": 8.983489036560059, "lr": 2.92e-05, "elapsed_sec": 602.1597874164581, "step_time_sec": 8.224624268012121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 74, "loss": 8.947361946105957, "lr": 2.96e-05, "elapsed_sec": 610.3874292373657, "step_time_sec": 8.227483544003917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 75, "loss": 9.003510475158691, "lr": 3e-05, "elapsed_sec": 618.614182472229, "step_time_sec": 8.226600402995246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 76, "loss": 8.955291748046875, "lr": 3.04e-05, "elapsed_sec": 626.8432998657227, "step_time_sec": 8.22899653398781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 77, "loss": 8.866506576538086, "lr": 3.08e-05, "elapsed_sec": 635.0720722675323, "step_time_sec": 8.228563162003411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 78, "loss": 8.946792602539062, "lr": 3.12e-05, "elapsed_sec": 643.2985460758209, "step_time_sec": 8.226311420003185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 79, "loss": 8.867426872253418, "lr": 3.16e-05, "elapsed_sec": 651.5272991657257, "step_time_sec": 8.228644057002384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 80, "loss": 8.88417911529541, "lr": 3.2e-05, "elapsed_sec": 659.7523789405823, "step_time_sec": 8.224894465995021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 81, "loss": 8.845967292785645, "lr": 3.24e-05, "elapsed_sec": 667.980441570282, "step_time_sec": 8.227904877974652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 82, "loss": 8.853690147399902, "lr": 3.2800000000000004e-05, "elapsed_sec": 676.2103908061981, "step_time_sec": 8.229838394996477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 83, "loss": 8.805481910705566, "lr": 3.32e-05, "elapsed_sec": 684.4369676113129, "step_time_sec": 8.226367248018505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 84, "loss": 8.872130393981934, "lr": 3.3600000000000004e-05, "elapsed_sec": 692.6632628440857, "step_time_sec": 8.226179956021952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 85, "loss": 8.79236888885498, "lr": 3.4e-05, "elapsed_sec": 700.8922166824341, "step_time_sec": 8.228762699000072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 86, "loss": 8.782154083251953, "lr": 3.44e-05, "elapsed_sec": 709.1209115982056, "step_time_sec": 8.228594845015323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 87, "loss": 8.802003860473633, "lr": 3.4800000000000006e-05, "elapsed_sec": 717.349203824997, "step_time_sec": 8.228035923006246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 88, "loss": 8.71653938293457, "lr": 3.52e-05, "elapsed_sec": 725.5760035514832, "step_time_sec": 8.226645835005911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 89, "loss": 8.725602149963379, "lr": 3.56e-05, "elapsed_sec": 733.8026864528656, "step_time_sec": 8.226573090010788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 90, "loss": 8.666906356811523, "lr": 3.6e-05, "elapsed_sec": 742.0314040184021, "step_time_sec": 8.228514852991793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 91, "loss": 8.648832321166992, "lr": 3.6400000000000004e-05, "elapsed_sec": 750.2599403858185, "step_time_sec": 8.228319153015036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 92, "loss": 8.674057006835938, "lr": 3.68e-05, "elapsed_sec": 758.4882972240448, "step_time_sec": 8.228205305000301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 93, "loss": 8.651775360107422, "lr": 3.72e-05, "elapsed_sec": 766.7159011363983, "step_time_sec": 8.227513021993218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 94, "loss": 8.7017183303833, "lr": 3.76e-05, "elapsed_sec": 774.9423577785492, "step_time_sec": 8.226215075992513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 95, "loss": 8.678481101989746, "lr": 3.8e-05, "elapsed_sec": 783.1695923805237, "step_time_sec": 8.2270946019853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 96, "loss": 8.574710845947266, "lr": 3.8400000000000005e-05, "elapsed_sec": 791.3958599567413, "step_time_sec": 8.226105798996286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 97, "loss": 8.553394317626953, "lr": 3.88e-05, "elapsed_sec": 799.6251215934753, "step_time_sec": 8.229097522009397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 98, "loss": 8.541318893432617, "lr": 3.92e-05, "elapsed_sec": 807.8542671203613, "step_time_sec": 8.229027919005603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 99, "loss": 8.490690231323242, "lr": 3.96e-05, "elapsed_sec": 816.0788946151733, "step_time_sec": 8.22450232601841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 100, "loss": 8.528120040893555, "lr": 4e-05, "elapsed_sec": 824.3050665855408, "step_time_sec": 8.225977280002553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 101, "loss": 8.452980041503906, "lr": 4.0400000000000006e-05, "elapsed_sec": 832.5338566303253, "step_time_sec": 8.228587265999522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 102, "loss": 8.475951194763184, "lr": 4.08e-05, "elapsed_sec": 840.7599217891693, "step_time_sec": 8.225933666020865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 103, "loss": 8.519282341003418, "lr": 4.12e-05, "elapsed_sec": 848.9849605560303, "step_time_sec": 8.224897410022095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 104, "loss": 8.517868995666504, "lr": 4.160000000000001e-05, "elapsed_sec": 857.2143349647522, "step_time_sec": 8.22917488301755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 105, "loss": 8.458340644836426, "lr": 4.2000000000000004e-05, "elapsed_sec": 865.4432384967804, "step_time_sec": 8.228758362994995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 106, "loss": 8.409611701965332, "lr": 4.24e-05, "elapsed_sec": 873.6695094108582, "step_time_sec": 8.226096643978963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 107, "loss": 8.384879112243652, "lr": 4.2800000000000004e-05, "elapsed_sec": 881.8980994224548, "step_time_sec": 8.22849046101328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 108, "loss": 8.366512298583984, "lr": 4.32e-05, "elapsed_sec": 890.1263861656189, "step_time_sec": 8.22808132902719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 109, "loss": 8.412646293640137, "lr": 4.36e-05, "elapsed_sec": 898.3525440692902, "step_time_sec": 8.226069127995288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 110, "loss": 8.258499145507812, "lr": 4.4000000000000006e-05, "elapsed_sec": 906.580171585083, "step_time_sec": 8.227396033005789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 111, "loss": 8.393250465393066, "lr": 4.44e-05, "elapsed_sec": 914.8069927692413, "step_time_sec": 8.226662354980363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 112, "loss": 8.370220184326172, "lr": 4.48e-05, "elapsed_sec": 923.0342090129852, "step_time_sec": 8.22708536000573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 113, "loss": 8.356823921203613, "lr": 4.52e-05, "elapsed_sec": 931.2619106769562, "step_time_sec": 8.227544255991234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 114, "loss": 8.274832725524902, "lr": 4.5600000000000004e-05, "elapsed_sec": 939.4886457920074, "step_time_sec": 8.226598990993807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 115, "loss": 8.310687065124512, "lr": 4.6e-05, "elapsed_sec": 947.7177219390869, "step_time_sec": 8.228872418985702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 116, "loss": 8.303194046020508, "lr": 4.64e-05, "elapsed_sec": 955.9481883049011, "step_time_sec": 8.230324061994907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 117, "loss": 8.269158363342285, "lr": 4.68e-05, "elapsed_sec": 964.1777496337891, "step_time_sec": 8.229445531003876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 118, "loss": 8.287335395812988, "lr": 4.72e-05, "elapsed_sec": 972.4063730239868, "step_time_sec": 8.228402577980887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 119, "loss": 8.247856140136719, "lr": 4.7600000000000005e-05, "elapsed_sec": 980.6348831653595, "step_time_sec": 8.228387037001085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 120, "loss": 8.28364086151123, "lr": 4.8e-05, "elapsed_sec": 988.864363193512, "step_time_sec": 8.229302419000305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 121, "loss": 8.1890287399292, "lr": 4.8400000000000004e-05, "elapsed_sec": 997.0933372974396, "step_time_sec": 8.22892499997397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 122, "loss": 8.181218147277832, "lr": 4.88e-05, "elapsed_sec": 1005.3224341869354, "step_time_sec": 8.228860404982697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 123, "loss": 8.129717826843262, "lr": 4.92e-05, "elapsed_sec": 1013.5500860214233, "step_time_sec": 8.227453818020876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 124, "loss": 8.186448097229004, "lr": 4.9600000000000006e-05, "elapsed_sec": 1021.7784838676453, "step_time_sec": 8.228252686996711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 125, "loss": 8.219650268554688, "lr": 5e-05, "elapsed_sec": 1030.004272699356, "step_time_sec": 8.225660346011864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 126, "loss": 8.189053535461426, "lr": 5.04e-05, "elapsed_sec": 1038.231679201126, "step_time_sec": 8.227238154999213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 127, "loss": 8.17771053314209, "lr": 5.08e-05, "elapsed_sec": 1046.4575293064117, "step_time_sec": 8.225677954003913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 128, "loss": 8.121573448181152, "lr": 5.1200000000000004e-05, "elapsed_sec": 1054.6865780353546, "step_time_sec": 8.228896954999072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 129, "loss": 8.116137504577637, "lr": 5.16e-05, "elapsed_sec": 1062.9155914783478, "step_time_sec": 8.228865665994817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 130, "loss": 8.039885520935059, "lr": 5.2000000000000004e-05, "elapsed_sec": 1071.1441271305084, "step_time_sec": 8.228383493988076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 131, "loss": 8.123106956481934, "lr": 5.24e-05, "elapsed_sec": 1079.3712763786316, "step_time_sec": 8.227051247988129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 132, "loss": 8.064987182617188, "lr": 5.28e-05, "elapsed_sec": 1087.5992250442505, "step_time_sec": 8.22776033100672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 133, "loss": 8.063470840454102, "lr": 5.3200000000000006e-05, "elapsed_sec": 1095.828236579895, "step_time_sec": 8.228834937006468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 134, "loss": 8.062150001525879, "lr": 5.36e-05, "elapsed_sec": 1104.0570619106293, "step_time_sec": 8.228655274986522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 135, "loss": 8.037906646728516, "lr": 5.4e-05, "elapsed_sec": 1112.2859466075897, "step_time_sec": 8.228766052023275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 136, "loss": 8.012834548950195, "lr": 5.44e-05, "elapsed_sec": 1120.5147602558136, "step_time_sec": 8.228624985000351, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 137, "loss": 8.064157485961914, "lr": 5.4800000000000004e-05, "elapsed_sec": 1128.7438385486603, "step_time_sec": 8.228898624016438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 138, "loss": 8.00090217590332, "lr": 5.520000000000001e-05, "elapsed_sec": 1136.972501039505, "step_time_sec": 8.22851165299653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 139, "loss": 8.043827056884766, "lr": 5.56e-05, "elapsed_sec": 1145.2019419670105, "step_time_sec": 8.229277253994951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 140, "loss": 7.964439868927002, "lr": 5.6e-05, "elapsed_sec": 1153.4289751052856, "step_time_sec": 8.226887163007632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 141, "loss": 8.025486946105957, "lr": 5.640000000000001e-05, "elapsed_sec": 1161.6576037406921, "step_time_sec": 8.228434844990261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 142, "loss": 8.02719783782959, "lr": 5.6800000000000005e-05, "elapsed_sec": 1169.8848114013672, "step_time_sec": 8.227055218012538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 143, "loss": 8.026123046875, "lr": 5.72e-05, "elapsed_sec": 1178.1117885112762, "step_time_sec": 8.226876945002005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 144, "loss": 7.997283458709717, "lr": 5.7600000000000004e-05, "elapsed_sec": 1186.3384675979614, "step_time_sec": 8.22647249000147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 145, "loss": 7.99241304397583, "lr": 5.8e-05, "elapsed_sec": 1194.5651910305023, "step_time_sec": 8.226604864990804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 146, "loss": 7.966104030609131, "lr": 5.84e-05, "elapsed_sec": 1202.7927210330963, "step_time_sec": 8.227321956976084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 147, "loss": 7.94872522354126, "lr": 5.8800000000000006e-05, "elapsed_sec": 1211.0216798782349, "step_time_sec": 8.228802058001747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 148, "loss": 8.005328178405762, "lr": 5.92e-05, "elapsed_sec": 1219.2511749267578, "step_time_sec": 8.229422529024305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 149, "loss": 8.022841453552246, "lr": 5.96e-05, "elapsed_sec": 1227.4782445430756, "step_time_sec": 8.226897510990966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 150, "loss": 7.982725143432617, "lr": 6e-05, "elapsed_sec": 1235.7066390514374, "step_time_sec": 8.22821274198941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 151, "loss": 7.986963748931885, "lr": 6.0400000000000004e-05, "elapsed_sec": 1243.9357256889343, "step_time_sec": 8.228861059993505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 152, "loss": 7.821870803833008, "lr": 6.08e-05, "elapsed_sec": 1252.1651227474213, "step_time_sec": 8.229312283016043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 153, "loss": 7.865364074707031, "lr": 6.120000000000001e-05, "elapsed_sec": 1260.39408826828, "step_time_sec": 8.228759955993155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 154, "loss": 7.933681488037109, "lr": 6.16e-05, "elapsed_sec": 1268.622104883194, "step_time_sec": 8.227849513001274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 155, "loss": 7.966499328613281, "lr": 6.2e-05, "elapsed_sec": 1276.84952044487, "step_time_sec": 8.227292355993995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 156, "loss": 7.900127410888672, "lr": 6.24e-05, "elapsed_sec": 1285.0765199661255, "step_time_sec": 8.226799452997511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 157, "loss": 7.893668174743652, "lr": 6.280000000000001e-05, "elapsed_sec": 1293.3044135570526, "step_time_sec": 8.22774880801444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 158, "loss": 7.937892436981201, "lr": 6.32e-05, "elapsed_sec": 1301.5306367874146, "step_time_sec": 8.226060997985769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 159, "loss": 7.897235870361328, "lr": 6.36e-05, "elapsed_sec": 1309.758529663086, "step_time_sec": 8.227763317001518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 160, "loss": 7.898919582366943, "lr": 6.4e-05, "elapsed_sec": 1317.9854834079742, "step_time_sec": 8.226816034002695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 161, "loss": 7.880965232849121, "lr": 6.44e-05, "elapsed_sec": 1326.2146255970001, "step_time_sec": 8.229019316000631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 162, "loss": 7.850151062011719, "lr": 6.48e-05, "elapsed_sec": 1334.4446992874146, "step_time_sec": 8.229894930002047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 163, "loss": 7.805723667144775, "lr": 6.520000000000001e-05, "elapsed_sec": 1342.6736721992493, "step_time_sec": 8.228838594979607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 164, "loss": 7.765656471252441, "lr": 6.560000000000001e-05, "elapsed_sec": 1350.9009778499603, "step_time_sec": 8.227064717008034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 165, "loss": 7.829962253570557, "lr": 6.6e-05, "elapsed_sec": 1359.1305451393127, "step_time_sec": 8.229438693000702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 166, "loss": 7.914029121398926, "lr": 6.64e-05, "elapsed_sec": 1367.3567342758179, "step_time_sec": 8.226053602003958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 167, "loss": 7.813790321350098, "lr": 6.68e-05, "elapsed_sec": 1375.585284948349, "step_time_sec": 8.2283523149963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 168, "loss": 7.842958450317383, "lr": 6.720000000000001e-05, "elapsed_sec": 1383.812451839447, "step_time_sec": 8.227000137994764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 169, "loss": 7.799837112426758, "lr": 6.76e-05, "elapsed_sec": 1392.0387120246887, "step_time_sec": 8.22610564000206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 170, "loss": 7.845525741577148, "lr": 6.8e-05, "elapsed_sec": 1400.266545534134, "step_time_sec": 8.227681131014833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 171, "loss": 7.879123210906982, "lr": 6.840000000000001e-05, "elapsed_sec": 1408.49511885643, "step_time_sec": 8.228472704999149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 172, "loss": 7.830366611480713, "lr": 6.88e-05, "elapsed_sec": 1416.724493265152, "step_time_sec": 8.229165452998132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 173, "loss": 7.851619720458984, "lr": 6.92e-05, "elapsed_sec": 1424.9513323307037, "step_time_sec": 8.226655363017926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 174, "loss": 7.788964748382568, "lr": 6.960000000000001e-05, "elapsed_sec": 1433.1793899536133, "step_time_sec": 8.227964438003255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 175, "loss": 7.803962230682373, "lr": 7.000000000000001e-05, "elapsed_sec": 1441.4065992832184, "step_time_sec": 8.22702950699022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 176, "loss": 7.668135166168213, "lr": 7.04e-05, "elapsed_sec": 1449.6345188617706, "step_time_sec": 8.22778825100977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 177, "loss": 7.764729022979736, "lr": 7.08e-05, "elapsed_sec": 1457.861733675003, "step_time_sec": 8.226997952006059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 178, "loss": 7.808509826660156, "lr": 7.12e-05, "elapsed_sec": 1466.0892674922943, "step_time_sec": 8.22737415699521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 179, "loss": 7.711036682128906, "lr": 7.159999999999999e-05, "elapsed_sec": 1474.317617893219, "step_time_sec": 8.228205077000894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 180, "loss": 7.732991695404053, "lr": 7.2e-05, "elapsed_sec": 1482.5470683574677, "step_time_sec": 8.229340677993605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 181, "loss": 7.68436336517334, "lr": 7.240000000000001e-05, "elapsed_sec": 1490.776395559311, "step_time_sec": 8.229101303993957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 182, "loss": 7.674361705780029, "lr": 7.280000000000001e-05, "elapsed_sec": 1499.0044190883636, "step_time_sec": 8.227854864991968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 183, "loss": 7.7498064041137695, "lr": 7.32e-05, "elapsed_sec": 1507.2313332557678, "step_time_sec": 8.22677240698249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 184, "loss": 7.782443046569824, "lr": 7.36e-05, "elapsed_sec": 1515.460646390915, "step_time_sec": 8.229229268006748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 185, "loss": 7.6725640296936035, "lr": 7.400000000000001e-05, "elapsed_sec": 1523.6875936985016, "step_time_sec": 8.226717138983076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 186, "loss": 7.673073768615723, "lr": 7.44e-05, "elapsed_sec": 1531.915724515915, "step_time_sec": 8.22798798698932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 187, "loss": 7.69179630279541, "lr": 7.48e-05, "elapsed_sec": 1540.143107175827, "step_time_sec": 8.227192038990324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 188, "loss": 7.625990867614746, "lr": 7.52e-05, "elapsed_sec": 1548.3706305027008, "step_time_sec": 8.227451856975676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 189, "loss": 7.715603351593018, "lr": 7.56e-05, "elapsed_sec": 1556.5967857837677, "step_time_sec": 8.225936737988377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 190, "loss": 7.729246616363525, "lr": 7.6e-05, "elapsed_sec": 1564.826063156128, "step_time_sec": 8.229129856976215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 191, "loss": 7.697624683380127, "lr": 7.640000000000001e-05, "elapsed_sec": 1573.0552167892456, "step_time_sec": 8.228968044015346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 192, "loss": 7.601853370666504, "lr": 7.680000000000001e-05, "elapsed_sec": 1581.2823877334595, "step_time_sec": 8.22700709401397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 193, "loss": 7.541373252868652, "lr": 7.72e-05, "elapsed_sec": 1589.5097465515137, "step_time_sec": 8.227253572986228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 194, "loss": 7.603813171386719, "lr": 7.76e-05, "elapsed_sec": 1597.7369709014893, "step_time_sec": 8.22705059699365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 195, "loss": 7.609255313873291, "lr": 7.8e-05, "elapsed_sec": 1605.9635767936707, "step_time_sec": 8.226435885007959, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 196, "loss": 7.652596473693848, "lr": 7.84e-05, "elapsed_sec": 1614.1916546821594, "step_time_sec": 8.227884914987953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 197, "loss": 7.619997024536133, "lr": 7.88e-05, "elapsed_sec": 1622.4184775352478, "step_time_sec": 8.226670581992948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 198, "loss": 7.575228214263916, "lr": 7.92e-05, "elapsed_sec": 1630.6458644866943, "step_time_sec": 8.227304743981222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 199, "loss": 7.574359893798828, "lr": 7.960000000000001e-05, "elapsed_sec": 1638.8746581077576, "step_time_sec": 8.228621006011963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 200, "loss": 7.526299953460693, "lr": 8e-05, "elapsed_sec": 1647.10365152359, "step_time_sec": 8.228770854999311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 201, "loss": 7.531688690185547, "lr": 8.04e-05, "elapsed_sec": 1655.332654953003, "step_time_sec": 8.22885163599858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 202, "loss": 7.625715732574463, "lr": 8.080000000000001e-05, "elapsed_sec": 1663.5620532035828, "step_time_sec": 8.229259801009903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 203, "loss": 7.5046467781066895, "lr": 8.120000000000001e-05, "elapsed_sec": 1671.7921402454376, "step_time_sec": 8.22991745499894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 204, "loss": 7.4598212242126465, "lr": 8.16e-05, "elapsed_sec": 1680.019728422165, "step_time_sec": 8.227452441991773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 205, "loss": 7.5626115798950195, "lr": 8.2e-05, "elapsed_sec": 1688.2469153404236, "step_time_sec": 8.227006647008238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 206, "loss": 7.564806938171387, "lr": 8.24e-05, "elapsed_sec": 1696.4767792224884, "step_time_sec": 8.22969502399792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 207, "loss": 7.436630725860596, "lr": 8.28e-05, "elapsed_sec": 1704.7062423229218, "step_time_sec": 8.229413364984794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 208, "loss": 7.560800552368164, "lr": 8.320000000000002e-05, "elapsed_sec": 1712.9360435009003, "step_time_sec": 8.229607248998946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 209, "loss": 7.489374160766602, "lr": 8.360000000000001e-05, "elapsed_sec": 1721.165104150772, "step_time_sec": 8.228834658977576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 210, "loss": 7.51253604888916, "lr": 8.400000000000001e-05, "elapsed_sec": 1729.3941977024078, "step_time_sec": 8.228978728002403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 211, "loss": 7.537048816680908, "lr": 8.44e-05, "elapsed_sec": 1737.621776342392, "step_time_sec": 8.227379743009806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 212, "loss": 7.452347278594971, "lr": 8.48e-05, "elapsed_sec": 1745.8488750457764, "step_time_sec": 8.226953367004171, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 213, "loss": 7.472954273223877, "lr": 8.52e-05, "elapsed_sec": 1754.075516462326, "step_time_sec": 8.226468744018348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 214, "loss": 7.483042240142822, "lr": 8.560000000000001e-05, "elapsed_sec": 1762.3046925067902, "step_time_sec": 8.2290231329971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 215, "loss": 7.393919944763184, "lr": 8.6e-05, "elapsed_sec": 1770.5331642627716, "step_time_sec": 8.228349096985767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 216, "loss": 7.399316310882568, "lr": 8.64e-05, "elapsed_sec": 1778.762047290802, "step_time_sec": 8.228732515010051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 217, "loss": 7.518381595611572, "lr": 8.68e-05, "elapsed_sec": 1786.9884595870972, "step_time_sec": 8.226224555022782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 218, "loss": 7.46047306060791, "lr": 8.72e-05, "elapsed_sec": 1795.2166957855225, "step_time_sec": 8.228047103999415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 219, "loss": 7.405009746551514, "lr": 8.76e-05, "elapsed_sec": 1803.4435000419617, "step_time_sec": 8.226648384006694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 220, "loss": 7.269919395446777, "lr": 8.800000000000001e-05, "elapsed_sec": 1811.670837879181, "step_time_sec": 8.227243952016579, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 221, "loss": 7.36342716217041, "lr": 8.840000000000001e-05, "elapsed_sec": 1819.898206949234, "step_time_sec": 8.227188025979558, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 222, "loss": 7.44268274307251, "lr": 8.88e-05, "elapsed_sec": 1828.1268570423126, "step_time_sec": 8.228486798994709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 223, "loss": 7.303511142730713, "lr": 8.92e-05, "elapsed_sec": 1836.3555240631104, "step_time_sec": 8.228479062992847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 224, "loss": 7.2761077880859375, "lr": 8.96e-05, "elapsed_sec": 1844.5833168029785, "step_time_sec": 8.22762337801396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 225, "loss": 7.360838890075684, "lr": 9e-05, "elapsed_sec": 1852.8103125095367, "step_time_sec": 8.226842534990283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 226, "loss": 7.4133687019348145, "lr": 9.04e-05, "elapsed_sec": 1861.0382850170135, "step_time_sec": 8.227835976984352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 227, "loss": 7.236988544464111, "lr": 9.080000000000001e-05, "elapsed_sec": 1869.2651069164276, "step_time_sec": 8.226721969986102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 228, "loss": 7.403906345367432, "lr": 9.120000000000001e-05, "elapsed_sec": 1877.4928455352783, "step_time_sec": 8.227509759017266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 229, "loss": 7.3328537940979, "lr": 9.16e-05, "elapsed_sec": 1885.7208790779114, "step_time_sec": 8.227893224015133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 230, "loss": 7.465392589569092, "lr": 9.2e-05, "elapsed_sec": 1893.9479689598083, "step_time_sec": 8.226952513010474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 231, "loss": 7.178247451782227, "lr": 9.240000000000001e-05, "elapsed_sec": 1902.1758348941803, "step_time_sec": 8.227662861987483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 232, "loss": 7.273200035095215, "lr": 9.28e-05, "elapsed_sec": 1910.4030694961548, "step_time_sec": 8.227092166023795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 233, "loss": 7.347271919250488, "lr": 9.32e-05, "elapsed_sec": 1918.6327421665192, "step_time_sec": 8.22949939701357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 234, "loss": 7.330509185791016, "lr": 9.36e-05, "elapsed_sec": 1926.8621289730072, "step_time_sec": 8.229228615993634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 235, "loss": 7.250112533569336, "lr": 9.4e-05, "elapsed_sec": 1935.0918123722076, "step_time_sec": 8.22958805502276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 236, "loss": 7.226873874664307, "lr": 9.44e-05, "elapsed_sec": 1943.3212912082672, "step_time_sec": 8.229306369001279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 237, "loss": 7.336287021636963, "lr": 9.480000000000001e-05, "elapsed_sec": 1951.5512328147888, "step_time_sec": 8.229789029981475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 238, "loss": 7.187093734741211, "lr": 9.520000000000001e-05, "elapsed_sec": 1959.7813565731049, "step_time_sec": 8.229943002021173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 239, "loss": 7.2485480308532715, "lr": 9.56e-05, "elapsed_sec": 1968.010727405548, "step_time_sec": 8.229204164003022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 240, "loss": 7.2565202713012695, "lr": 9.6e-05, "elapsed_sec": 1976.241651058197, "step_time_sec": 8.230713229015237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 241, "loss": 7.263650417327881, "lr": 9.64e-05, "elapsed_sec": 1984.4713304042816, "step_time_sec": 8.229529924021335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 242, "loss": 7.2428107261657715, "lr": 9.680000000000001e-05, "elapsed_sec": 1992.7006587982178, "step_time_sec": 8.22915789700346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 243, "loss": 7.31964111328125, "lr": 9.72e-05, "elapsed_sec": 2000.9307975769043, "step_time_sec": 8.229992303007748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 244, "loss": 7.286164283752441, "lr": 9.76e-05, "elapsed_sec": 2009.1605937480927, "step_time_sec": 8.22965549401124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 245, "loss": 7.211132526397705, "lr": 9.800000000000001e-05, "elapsed_sec": 2017.3904385566711, "step_time_sec": 8.229651319998084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 246, "loss": 7.1959381103515625, "lr": 9.84e-05, "elapsed_sec": 2025.6194982528687, "step_time_sec": 8.22891378897475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 247, "loss": 7.194977760314941, "lr": 9.88e-05, "elapsed_sec": 2033.8493127822876, "step_time_sec": 8.22964189297636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 248, "loss": 7.153557300567627, "lr": 9.920000000000001e-05, "elapsed_sec": 2042.0790333747864, "step_time_sec": 8.229565607005497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 249, "loss": 7.270102024078369, "lr": 9.960000000000001e-05, "elapsed_sec": 2050.3092234134674, "step_time_sec": 8.230036156019196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 250, "loss": 7.205660343170166, "lr": 0.0001, "elapsed_sec": 2058.53729057312, "step_time_sec": 8.227940097014653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 251, "loss": 7.198582172393799, "lr": 0.0001004, "elapsed_sec": 2066.76554274559, "step_time_sec": 8.228049919998739, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 252, "loss": 7.198324203491211, "lr": 0.0001008, "elapsed_sec": 2074.9930126667023, "step_time_sec": 8.227386382000986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 253, "loss": 7.291356563568115, "lr": 0.0001012, "elapsed_sec": 2083.221931219101, "step_time_sec": 8.22875623300206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 254, "loss": 7.154963970184326, "lr": 0.0001016, "elapsed_sec": 2091.451548099518, "step_time_sec": 8.229411342006642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 255, "loss": 7.191559314727783, "lr": 0.00010200000000000001, "elapsed_sec": 2099.681376218796, "step_time_sec": 8.229671002016403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 256, "loss": 7.255947113037109, "lr": 0.00010240000000000001, "elapsed_sec": 2107.911891937256, "step_time_sec": 8.230364036979154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 257, "loss": 7.1239447593688965, "lr": 0.0001028, "elapsed_sec": 2116.1417479515076, "step_time_sec": 8.229685554979369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 258, "loss": 7.265697956085205, "lr": 0.0001032, "elapsed_sec": 2124.370642900467, "step_time_sec": 8.228791580011602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 259, "loss": 7.110750675201416, "lr": 0.00010360000000000001, "elapsed_sec": 2132.599527835846, "step_time_sec": 8.228693206008757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 260, "loss": 7.161442279815674, "lr": 0.00010400000000000001, "elapsed_sec": 2140.8268206119537, "step_time_sec": 8.227136275003431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 261, "loss": 7.146083831787109, "lr": 0.0001044, "elapsed_sec": 2149.054901123047, "step_time_sec": 8.227944501995808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 262, "loss": 7.140963554382324, "lr": 0.0001048, "elapsed_sec": 2157.2821571826935, "step_time_sec": 8.227169925987255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 263, "loss": 7.148171901702881, "lr": 0.0001052, "elapsed_sec": 2165.511364221573, "step_time_sec": 8.229006701003527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 264, "loss": 7.118491172790527, "lr": 0.0001056, "elapsed_sec": 2173.741420030594, "step_time_sec": 8.229955667018658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 265, "loss": 7.279483795166016, "lr": 0.00010600000000000002, "elapsed_sec": 2181.9711294174194, "step_time_sec": 8.229487856995547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 266, "loss": 7.154058456420898, "lr": 0.00010640000000000001, "elapsed_sec": 2190.2010250091553, "step_time_sec": 8.229782395996153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 267, "loss": 7.016635417938232, "lr": 0.00010680000000000001, "elapsed_sec": 2198.428619861603, "step_time_sec": 8.227414310007589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 268, "loss": 7.305427551269531, "lr": 0.0001072, "elapsed_sec": 2206.6559085845947, "step_time_sec": 8.22714824101422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 269, "loss": 7.233744144439697, "lr": 0.0001076, "elapsed_sec": 2214.8832182884216, "step_time_sec": 8.227188213990303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 270, "loss": 7.139033794403076, "lr": 0.000108, "elapsed_sec": 2223.111053466797, "step_time_sec": 8.227611395006534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 271, "loss": 7.082510471343994, "lr": 0.0001084, "elapsed_sec": 2231.340703010559, "step_time_sec": 8.229549951996887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 272, "loss": 7.10545015335083, "lr": 0.0001088, "elapsed_sec": 2239.5682051181793, "step_time_sec": 8.227323305996833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 273, "loss": 7.061676025390625, "lr": 0.00010920000000000001, "elapsed_sec": 2247.7948582172394, "step_time_sec": 8.226448328001425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 274, "loss": 7.000365257263184, "lr": 0.00010960000000000001, "elapsed_sec": 2256.0222313404083, "step_time_sec": 8.22727186899283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 275, "loss": 7.05740213394165, "lr": 0.00011, "elapsed_sec": 2264.250384569168, "step_time_sec": 8.227953716006596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 276, "loss": 7.064769744873047, "lr": 0.00011040000000000001, "elapsed_sec": 2272.4788796901703, "step_time_sec": 8.228323008981533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 277, "loss": 7.128411769866943, "lr": 0.00011080000000000001, "elapsed_sec": 2280.7084696292877, "step_time_sec": 8.229476655018516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 278, "loss": 7.112476348876953, "lr": 0.0001112, "elapsed_sec": 2288.938007593155, "step_time_sec": 8.229332536982838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 279, "loss": 7.12471866607666, "lr": 0.0001116, "elapsed_sec": 2297.168041944504, "step_time_sec": 8.229874141019536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 280, "loss": 7.1327595710754395, "lr": 0.000112, "elapsed_sec": 2305.3955924510956, "step_time_sec": 8.227449517988134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 281, "loss": 7.022354602813721, "lr": 0.0001124, "elapsed_sec": 2313.621566057205, "step_time_sec": 8.225814406992868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 282, "loss": 7.0943522453308105, "lr": 0.00011280000000000002, "elapsed_sec": 2321.848985671997, "step_time_sec": 8.22721573500894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 283, "loss": 7.086832046508789, "lr": 0.00011320000000000001, "elapsed_sec": 2330.077996492386, "step_time_sec": 8.228890992992092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 284, "loss": 7.103909492492676, "lr": 0.00011360000000000001, "elapsed_sec": 2338.305232524872, "step_time_sec": 8.227104064979358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 285, "loss": 7.194436073303223, "lr": 0.000114, "elapsed_sec": 2346.534899711609, "step_time_sec": 8.22946354400483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 286, "loss": 7.089519500732422, "lr": 0.0001144, "elapsed_sec": 2354.762221813202, "step_time_sec": 8.227144981996389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 287, "loss": 7.0063886642456055, "lr": 0.0001148, "elapsed_sec": 2362.9897978305817, "step_time_sec": 8.227414263994433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 288, "loss": 6.935528755187988, "lr": 0.00011520000000000001, "elapsed_sec": 2371.219544172287, "step_time_sec": 8.229610954003874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 289, "loss": 7.201207160949707, "lr": 0.0001156, "elapsed_sec": 2379.4489467144012, "step_time_sec": 8.22929912499967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 290, "loss": 6.970212936401367, "lr": 0.000116, "elapsed_sec": 2387.6786680221558, "step_time_sec": 8.229505917988718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 291, "loss": 7.358813285827637, "lr": 0.0001164, "elapsed_sec": 2395.9087965488434, "step_time_sec": 8.229956759983907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 292, "loss": 6.945559501647949, "lr": 0.0001168, "elapsed_sec": 2404.138190984726, "step_time_sec": 8.229241471010027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 293, "loss": 7.115769386291504, "lr": 0.00011720000000000002, "elapsed_sec": 2412.367651939392, "step_time_sec": 8.229374838992953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 294, "loss": 6.9364495277404785, "lr": 0.00011760000000000001, "elapsed_sec": 2420.5968105793, "step_time_sec": 8.228968013980193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 295, "loss": 6.96002197265625, "lr": 0.00011800000000000001, "elapsed_sec": 2428.8265810012817, "step_time_sec": 8.2296137859812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 296, "loss": 6.925050258636475, "lr": 0.0001184, "elapsed_sec": 2437.0520327091217, "step_time_sec": 8.225274042983074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 297, "loss": 7.058743476867676, "lr": 0.0001188, "elapsed_sec": 2445.2800636291504, "step_time_sec": 8.227906671003439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 298, "loss": 6.979632377624512, "lr": 0.0001192, "elapsed_sec": 2453.508845090866, "step_time_sec": 8.22856148800929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 299, "loss": 6.9180378913879395, "lr": 0.00011960000000000001, "elapsed_sec": 2461.735913038254, "step_time_sec": 8.226924996008165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 300, "loss": 6.941965103149414, "lr": 0.00012, "elapsed_sec": 2469.9644827842712, "step_time_sec": 8.228412689990364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 301, "loss": 7.049347877502441, "lr": 0.00012040000000000001, "elapsed_sec": 2478.1924607753754, "step_time_sec": 8.227819936000742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 302, "loss": 6.964862823486328, "lr": 0.00012080000000000001, "elapsed_sec": 2486.4224326610565, "step_time_sec": 8.229830272000981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 303, "loss": 6.9654860496521, "lr": 0.0001212, "elapsed_sec": 2494.6501066684723, "step_time_sec": 8.227515136997681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 304, "loss": 6.925640106201172, "lr": 0.0001216, "elapsed_sec": 2502.8795170783997, "step_time_sec": 8.229264488996705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 305, "loss": 7.006808757781982, "lr": 0.00012200000000000001, "elapsed_sec": 2511.110067129135, "step_time_sec": 8.230418813007418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 306, "loss": 7.020445823669434, "lr": 0.00012240000000000002, "elapsed_sec": 2519.3383882045746, "step_time_sec": 8.228125245979754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 307, "loss": 7.026490688323975, "lr": 0.0001228, "elapsed_sec": 2527.566355228424, "step_time_sec": 8.227782502013724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 308, "loss": 7.002697944641113, "lr": 0.0001232, "elapsed_sec": 2535.794571876526, "step_time_sec": 8.228089699987322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 309, "loss": 7.032038688659668, "lr": 0.0001236, "elapsed_sec": 2544.0230679512024, "step_time_sec": 8.228365982999094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 310, "loss": 6.969245910644531, "lr": 0.000124, "elapsed_sec": 2552.2495980262756, "step_time_sec": 8.226298509020125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 311, "loss": 6.863881587982178, "lr": 0.00012440000000000002, "elapsed_sec": 2560.479412317276, "step_time_sec": 8.229719841998303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 312, "loss": 6.988770484924316, "lr": 0.0001248, "elapsed_sec": 2568.7100031375885, "step_time_sec": 8.230439398001181, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 313, "loss": 6.945908546447754, "lr": 0.0001252, "elapsed_sec": 2576.9403846263885, "step_time_sec": 8.230168997019064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 314, "loss": 6.966862201690674, "lr": 0.00012560000000000002, "elapsed_sec": 2585.1698849201202, "step_time_sec": 8.229363998980261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 315, "loss": 7.040717124938965, "lr": 0.000126, "elapsed_sec": 2593.39994096756, "step_time_sec": 8.229881619015941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 316, "loss": 6.999602794647217, "lr": 0.0001264, "elapsed_sec": 2601.627424478531, "step_time_sec": 8.227323072002036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 317, "loss": 6.925985813140869, "lr": 0.0001268, "elapsed_sec": 2609.855988264084, "step_time_sec": 8.228443494008388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 318, "loss": 6.899860382080078, "lr": 0.0001272, "elapsed_sec": 2618.083344936371, "step_time_sec": 8.227189678989816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 319, "loss": 6.969964981079102, "lr": 0.0001276, "elapsed_sec": 2626.313257932663, "step_time_sec": 8.229726231016684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 320, "loss": 7.003751277923584, "lr": 0.000128, "elapsed_sec": 2634.5434923171997, "step_time_sec": 8.230086089024553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 321, "loss": 6.99003791809082, "lr": 0.0001284, "elapsed_sec": 2642.772829771042, "step_time_sec": 8.229201506997924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 322, "loss": 6.81178092956543, "lr": 0.0001288, "elapsed_sec": 2651.00213098526, "step_time_sec": 8.229142123978818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 323, "loss": 6.854548931121826, "lr": 0.0001292, "elapsed_sec": 2659.229597091675, "step_time_sec": 8.227355967013864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 324, "loss": 7.077430248260498, "lr": 0.0001296, "elapsed_sec": 2667.45645904541, "step_time_sec": 8.226663228997495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 325, "loss": 6.954282760620117, "lr": 0.00013000000000000002, "elapsed_sec": 2675.684112071991, "step_time_sec": 8.227544366993243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 326, "loss": 6.97117280960083, "lr": 0.00013040000000000003, "elapsed_sec": 2683.9129779338837, "step_time_sec": 8.228737260011258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 327, "loss": 7.002613544464111, "lr": 0.0001308, "elapsed_sec": 2692.142060995102, "step_time_sec": 8.228867071011337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 328, "loss": 6.950592517852783, "lr": 0.00013120000000000002, "elapsed_sec": 2700.3694653511047, "step_time_sec": 8.227271606010618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 329, "loss": 7.034499168395996, "lr": 0.0001316, "elapsed_sec": 2708.5975432395935, "step_time_sec": 8.227857543010032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 330, "loss": 6.860952854156494, "lr": 0.000132, "elapsed_sec": 2716.826049566269, "step_time_sec": 8.2283888689999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 331, "loss": 7.037774562835693, "lr": 0.00013240000000000002, "elapsed_sec": 2725.0539202690125, "step_time_sec": 8.227690942003392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 332, "loss": 6.797207355499268, "lr": 0.0001328, "elapsed_sec": 2733.282640695572, "step_time_sec": 8.228573140018852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 333, "loss": 6.901675224304199, "lr": 0.0001332, "elapsed_sec": 2741.5114316940308, "step_time_sec": 8.228623705013888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 334, "loss": 6.8239641189575195, "lr": 0.0001336, "elapsed_sec": 2749.741860151291, "step_time_sec": 8.230275000008987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 335, "loss": 6.815365791320801, "lr": 0.000134, "elapsed_sec": 2757.9715485572815, "step_time_sec": 8.229550461983308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 336, "loss": 6.814860820770264, "lr": 0.00013440000000000001, "elapsed_sec": 2766.2010476589203, "step_time_sec": 8.229412112996215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 337, "loss": 6.9280619621276855, "lr": 0.0001348, "elapsed_sec": 2774.431359052658, "step_time_sec": 8.23007140000118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 338, "loss": 6.862276077270508, "lr": 0.0001352, "elapsed_sec": 2782.6607315540314, "step_time_sec": 8.22921575899818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 339, "loss": 6.737376689910889, "lr": 0.0001356, "elapsed_sec": 2790.8903529644012, "step_time_sec": 8.229477596003562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 340, "loss": 6.936995506286621, "lr": 0.000136, "elapsed_sec": 2799.1178829669952, "step_time_sec": 8.227407295984449, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 341, "loss": 6.872287273406982, "lr": 0.00013639999999999998, "elapsed_sec": 2807.347144842148, "step_time_sec": 8.229062566009816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 342, "loss": 6.8285698890686035, "lr": 0.00013680000000000002, "elapsed_sec": 2815.576596260071, "step_time_sec": 8.22932806200697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 343, "loss": 6.871025562286377, "lr": 0.00013720000000000003, "elapsed_sec": 2823.8059096336365, "step_time_sec": 8.22912957999506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 344, "loss": 6.96433162689209, "lr": 0.0001376, "elapsed_sec": 2832.0342252254486, "step_time_sec": 8.2282370350149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 345, "loss": 6.960544586181641, "lr": 0.00013800000000000002, "elapsed_sec": 2840.262539625168, "step_time_sec": 8.228099586005555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 346, "loss": 7.003678321838379, "lr": 0.0001384, "elapsed_sec": 2848.4897499084473, "step_time_sec": 8.227032164984848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 347, "loss": 6.9760308265686035, "lr": 0.0001388, "elapsed_sec": 2856.7178173065186, "step_time_sec": 8.227928387990687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 348, "loss": 6.959328651428223, "lr": 0.00013920000000000002, "elapsed_sec": 2864.945660352707, "step_time_sec": 8.227684187993873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 349, "loss": 6.952682971954346, "lr": 0.0001396, "elapsed_sec": 2873.173398733139, "step_time_sec": 8.227582382998662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 350, "loss": 6.996934413909912, "lr": 0.00014000000000000001, "elapsed_sec": 2881.4027264118195, "step_time_sec": 8.229177622997668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 351, "loss": 6.931902885437012, "lr": 0.0001404, "elapsed_sec": 2889.6327419281006, "step_time_sec": 8.229868916998385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 352, "loss": 6.763069152832031, "lr": 0.0001408, "elapsed_sec": 2897.862524032593, "step_time_sec": 8.22962339699734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 353, "loss": 6.825408935546875, "lr": 0.00014120000000000002, "elapsed_sec": 2906.0925698280334, "step_time_sec": 8.229903842991916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 354, "loss": 6.804842472076416, "lr": 0.0001416, "elapsed_sec": 2914.3227581977844, "step_time_sec": 8.230062226997688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 355, "loss": 7.60021448135376, "lr": 0.000142, "elapsed_sec": 2922.5522089004517, "step_time_sec": 8.229291850992013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 356, "loss": 6.880292892456055, "lr": 0.0001424, "elapsed_sec": 2930.782730102539, "step_time_sec": 8.230358753004111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 357, "loss": 6.7435736656188965, "lr": 0.0001428, "elapsed_sec": 2939.011952638626, "step_time_sec": 8.229032644012477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 358, "loss": 6.808405876159668, "lr": 0.00014319999999999998, "elapsed_sec": 2947.2403202056885, "step_time_sec": 8.228236647992162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 359, "loss": 6.721776008605957, "lr": 0.0001436, "elapsed_sec": 2955.4685249328613, "step_time_sec": 8.228108791983686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 360, "loss": 6.929161071777344, "lr": 0.000144, "elapsed_sec": 2963.697576522827, "step_time_sec": 8.228810122003779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 361, "loss": 6.8344573974609375, "lr": 0.0001444, "elapsed_sec": 2971.927517414093, "step_time_sec": 8.229795352992369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 362, "loss": 6.7871479988098145, "lr": 0.00014480000000000002, "elapsed_sec": 2980.1574082374573, "step_time_sec": 8.229802779998863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 363, "loss": 6.713804244995117, "lr": 0.0001452, "elapsed_sec": 2988.3872005939484, "step_time_sec": 8.229610039998079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 364, "loss": 6.680352210998535, "lr": 0.00014560000000000002, "elapsed_sec": 2996.617450237274, "step_time_sec": 8.230059435009025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 365, "loss": 6.775351524353027, "lr": 0.00014600000000000003, "elapsed_sec": 3004.8463723659515, "step_time_sec": 8.228728582005715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 366, "loss": 6.787242889404297, "lr": 0.0001464, "elapsed_sec": 3013.074463367462, "step_time_sec": 8.228009385988116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 367, "loss": 6.907992362976074, "lr": 0.00014680000000000002, "elapsed_sec": 3021.3015537261963, "step_time_sec": 8.226875055988785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 368, "loss": 6.747190475463867, "lr": 0.0001472, "elapsed_sec": 3029.530531644821, "step_time_sec": 8.22888549999334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 369, "loss": 6.635620594024658, "lr": 0.0001476, "elapsed_sec": 3037.7589395046234, "step_time_sec": 8.22818905801978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 370, "loss": 6.703984260559082, "lr": 0.00014800000000000002, "elapsed_sec": 3045.988049030304, "step_time_sec": 8.229024193016812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 371, "loss": 6.9062113761901855, "lr": 0.0001484, "elapsed_sec": 3054.216861963272, "step_time_sec": 8.228585636010394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 372, "loss": 6.755170822143555, "lr": 0.0001488, "elapsed_sec": 3062.444739818573, "step_time_sec": 8.227728187979665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 373, "loss": 6.728435039520264, "lr": 0.0001492, "elapsed_sec": 3070.672993183136, "step_time_sec": 8.22816586800036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 374, "loss": 6.756184101104736, "lr": 0.0001496, "elapsed_sec": 3078.902580022812, "step_time_sec": 8.229330233007204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 375, "loss": 6.857005596160889, "lr": 0.00015, "elapsed_sec": 3087.132011651993, "step_time_sec": 8.229342331003863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 376, "loss": 6.764888286590576, "lr": 0.0001504, "elapsed_sec": 3095.361098766327, "step_time_sec": 8.228879975009477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 377, "loss": 6.790586471557617, "lr": 0.0001508, "elapsed_sec": 3103.589988231659, "step_time_sec": 8.228706474008504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 378, "loss": 6.8679585456848145, "lr": 0.0001512, "elapsed_sec": 3111.818941116333, "step_time_sec": 8.228752514987718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 379, "loss": 6.769087791442871, "lr": 0.00015160000000000003, "elapsed_sec": 3120.0483860969543, "step_time_sec": 8.229309619986452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 380, "loss": 6.80294132232666, "lr": 0.000152, "elapsed_sec": 3128.2759001255035, "step_time_sec": 8.22736962302588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 381, "loss": 6.815040111541748, "lr": 0.00015240000000000002, "elapsed_sec": 3136.5039088726044, "step_time_sec": 8.227858290018048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 382, "loss": 6.7534942626953125, "lr": 0.00015280000000000003, "elapsed_sec": 3144.731452226639, "step_time_sec": 8.227418728987686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 383, "loss": 6.69903564453125, "lr": 0.0001532, "elapsed_sec": 3152.9599747657776, "step_time_sec": 8.228361379005946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 384, "loss": 6.807240962982178, "lr": 0.00015360000000000002, "elapsed_sec": 3161.18768453598, "step_time_sec": 8.227505349001149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 385, "loss": 6.868391036987305, "lr": 0.000154, "elapsed_sec": 3169.4179384708405, "step_time_sec": 8.230167052999604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 386, "loss": 6.7243852615356445, "lr": 0.0001544, "elapsed_sec": 3177.6476826667786, "step_time_sec": 8.2295347689942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 387, "loss": 6.792775630950928, "lr": 0.00015480000000000002, "elapsed_sec": 3185.8751904964447, "step_time_sec": 8.227379839983769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 388, "loss": 6.711380481719971, "lr": 0.0001552, "elapsed_sec": 3194.104959964752, "step_time_sec": 8.229565357993124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 389, "loss": 6.732090950012207, "lr": 0.00015560000000000001, "elapsed_sec": 3202.3324201107025, "step_time_sec": 8.227346233004937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 390, "loss": 6.790426731109619, "lr": 0.000156, "elapsed_sec": 3210.562836408615, "step_time_sec": 8.230224037979497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 391, "loss": 6.651332378387451, "lr": 0.0001564, "elapsed_sec": 3218.792891263962, "step_time_sec": 8.229894732008688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 392, "loss": 6.738215923309326, "lr": 0.0001568, "elapsed_sec": 3227.018317461014, "step_time_sec": 8.225256998004625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 393, "loss": 6.720392227172852, "lr": 0.0001572, "elapsed_sec": 3235.2478518486023, "step_time_sec": 8.229389902000548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 394, "loss": 6.756256103515625, "lr": 0.0001576, "elapsed_sec": 3243.475440979004, "step_time_sec": 8.227436546003446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 395, "loss": 6.703514575958252, "lr": 0.000158, "elapsed_sec": 3251.7069325447083, "step_time_sec": 8.231402382982196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 396, "loss": 6.678727626800537, "lr": 0.0001584, "elapsed_sec": 3259.9373638629913, "step_time_sec": 8.230163283995353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 397, "loss": 6.70128059387207, "lr": 0.00015879999999999998, "elapsed_sec": 3268.167161464691, "step_time_sec": 8.229647956002736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 398, "loss": 6.627381324768066, "lr": 0.00015920000000000002, "elapsed_sec": 3276.397220134735, "step_time_sec": 8.229915145988343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 399, "loss": 6.735154151916504, "lr": 0.00015960000000000003, "elapsed_sec": 3284.6245744228363, "step_time_sec": 8.22725213697413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 400, "loss": 6.719301223754883, "lr": 0.00016, "elapsed_sec": 3292.8535499572754, "step_time_sec": 8.228785682993475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 401, "loss": 6.700405597686768, "lr": 0.00016040000000000002, "elapsed_sec": 3301.082801103592, "step_time_sec": 8.229130358988186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 402, "loss": 6.7373833656311035, "lr": 0.0001608, "elapsed_sec": 3309.3117809295654, "step_time_sec": 8.228780103992904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 403, "loss": 6.780934810638428, "lr": 0.00016120000000000002, "elapsed_sec": 3317.5419154167175, "step_time_sec": 8.229966875020182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 404, "loss": 6.683631896972656, "lr": 0.00016160000000000002, "elapsed_sec": 3325.769271373749, "step_time_sec": 8.227210542012472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 405, "loss": 6.859014511108398, "lr": 0.000162, "elapsed_sec": 3333.9981734752655, "step_time_sec": 8.228776272007963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 406, "loss": 6.735208034515381, "lr": 0.00016240000000000002, "elapsed_sec": 3342.2245252132416, "step_time_sec": 8.226166609005304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 407, "loss": 6.691999912261963, "lr": 0.0001628, "elapsed_sec": 3350.45507645607, "step_time_sec": 8.230383135000011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 408, "loss": 6.690687656402588, "lr": 0.0001632, "elapsed_sec": 3358.685069799423, "step_time_sec": 8.229821905988501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 409, "loss": 6.629654884338379, "lr": 0.0001636, "elapsed_sec": 3366.914855480194, "step_time_sec": 8.229605139000341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 410, "loss": 6.601104736328125, "lr": 0.000164, "elapsed_sec": 3375.143361568451, "step_time_sec": 8.228425376000814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 411, "loss": 6.769458770751953, "lr": 0.0001644, "elapsed_sec": 3383.371359348297, "step_time_sec": 8.227805948001333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 412, "loss": 6.650694847106934, "lr": 0.0001648, "elapsed_sec": 3391.599323987961, "step_time_sec": 8.227787625975907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 413, "loss": 6.7353515625, "lr": 0.0001652, "elapsed_sec": 3399.8291664123535, "step_time_sec": 8.229719456023304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 414, "loss": 6.734564781188965, "lr": 0.0001656, "elapsed_sec": 3408.0581283569336, "step_time_sec": 8.228817685012473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 415, "loss": 6.753288745880127, "lr": 0.000166, "elapsed_sec": 3416.2860605716705, "step_time_sec": 8.227738760004286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 416, "loss": 6.713940620422363, "lr": 0.00016640000000000003, "elapsed_sec": 3424.5121541023254, "step_time_sec": 8.225902559992392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 417, "loss": 6.663650035858154, "lr": 0.00016680000000000002, "elapsed_sec": 3432.741596698761, "step_time_sec": 8.229311771981884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 418, "loss": 6.580212116241455, "lr": 0.00016720000000000003, "elapsed_sec": 3440.9720871448517, "step_time_sec": 8.230361861002166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 419, "loss": 6.64075231552124, "lr": 0.0001676, "elapsed_sec": 3449.1992321014404, "step_time_sec": 8.226965837005991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 420, "loss": 6.661510467529297, "lr": 0.00016800000000000002, "elapsed_sec": 3457.427715063095, "step_time_sec": 8.228293729014695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 421, "loss": 6.679022789001465, "lr": 0.00016840000000000003, "elapsed_sec": 3465.656716823578, "step_time_sec": 8.228853779990459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 422, "loss": 6.743699550628662, "lr": 0.0001688, "elapsed_sec": 3473.884101629257, "step_time_sec": 8.227234634017805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 423, "loss": 6.6042985916137695, "lr": 0.00016920000000000002, "elapsed_sec": 3482.113354444504, "step_time_sec": 8.22909310500836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 424, "loss": 6.7631306648254395, "lr": 0.0001696, "elapsed_sec": 3490.340764284134, "step_time_sec": 8.22722386199166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 425, "loss": 6.693897247314453, "lr": 0.00017, "elapsed_sec": 3498.569150686264, "step_time_sec": 8.228231086017331, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 426, "loss": 6.640782833099365, "lr": 0.0001704, "elapsed_sec": 3506.797869682312, "step_time_sec": 8.228569760016399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 427, "loss": 6.682490825653076, "lr": 0.0001708, "elapsed_sec": 3515.0262167453766, "step_time_sec": 8.228270174004138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 428, "loss": 6.65167236328125, "lr": 0.00017120000000000001, "elapsed_sec": 3523.2549946308136, "step_time_sec": 8.228524572012248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 429, "loss": 6.621989727020264, "lr": 0.0001716, "elapsed_sec": 3531.4847750663757, "step_time_sec": 8.229646716994466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 430, "loss": 6.557704448699951, "lr": 0.000172, "elapsed_sec": 3539.7127051353455, "step_time_sec": 8.227766004012665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 431, "loss": 6.696403980255127, "lr": 0.0001724, "elapsed_sec": 3547.939833879471, "step_time_sec": 8.226973495009588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 432, "loss": 6.650622844696045, "lr": 0.0001728, "elapsed_sec": 3556.1684679985046, "step_time_sec": 8.228529226005776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 433, "loss": 6.6012749671936035, "lr": 0.0001732, "elapsed_sec": 3564.3965022563934, "step_time_sec": 8.22781192700495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 434, "loss": 6.644206523895264, "lr": 0.0001736, "elapsed_sec": 3572.626730442047, "step_time_sec": 8.23009021699545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 435, "loss": 6.6495161056518555, "lr": 0.00017400000000000003, "elapsed_sec": 3580.856378555298, "step_time_sec": 8.229533125006128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 436, "loss": 6.602592945098877, "lr": 0.0001744, "elapsed_sec": 3589.0862407684326, "step_time_sec": 8.229668611020315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 437, "loss": 6.6349029541015625, "lr": 0.00017480000000000002, "elapsed_sec": 3597.3128085136414, "step_time_sec": 8.226404646993615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 438, "loss": 6.6702961921691895, "lr": 0.0001752, "elapsed_sec": 3605.5412318706512, "step_time_sec": 8.228253854991635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 439, "loss": 6.515190601348877, "lr": 0.0001756, "elapsed_sec": 3613.7702984809875, "step_time_sec": 8.228998415987007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 440, "loss": 6.596498012542725, "lr": 0.00017600000000000002, "elapsed_sec": 3621.998311519623, "step_time_sec": 8.227796037012013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 441, "loss": 6.834029674530029, "lr": 0.0001764, "elapsed_sec": 3630.227772474289, "step_time_sec": 8.229342342994642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 442, "loss": 6.454840183258057, "lr": 0.00017680000000000001, "elapsed_sec": 3638.4577457904816, "step_time_sec": 8.229789876000723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 443, "loss": 6.65950870513916, "lr": 0.0001772, "elapsed_sec": 3646.687557220459, "step_time_sec": 8.229673464986263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 444, "loss": 6.6442484855651855, "lr": 0.0001776, "elapsed_sec": 3654.917657852173, "step_time_sec": 8.229954592010472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 445, "loss": 6.650251865386963, "lr": 0.00017800000000000002, "elapsed_sec": 3663.1474134922028, "step_time_sec": 8.229555102996528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 446, "loss": 6.5991668701171875, "lr": 0.0001784, "elapsed_sec": 3671.3751792907715, "step_time_sec": 8.22764986200491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 447, "loss": 6.635595321655273, "lr": 0.0001788, "elapsed_sec": 3679.6035947799683, "step_time_sec": 8.228218131989706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 448, "loss": 6.629244327545166, "lr": 0.0001792, "elapsed_sec": 3687.8332636356354, "step_time_sec": 8.229512827005237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 449, "loss": 6.732376575469971, "lr": 0.0001796, "elapsed_sec": 3696.0621304512024, "step_time_sec": 8.228777216019807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 450, "loss": 6.58833646774292, "lr": 0.00018, "elapsed_sec": 3704.2901632785797, "step_time_sec": 8.2278676200076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 451, "loss": 6.599917888641357, "lr": 0.0001804, "elapsed_sec": 3712.51726603508, "step_time_sec": 8.226897874992574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 452, "loss": 6.635191440582275, "lr": 0.0001808, "elapsed_sec": 3720.7453048229218, "step_time_sec": 8.227901609992841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 453, "loss": 6.589687824249268, "lr": 0.0001812, "elapsed_sec": 3728.9750587940216, "step_time_sec": 8.229593730997294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 454, "loss": 6.513318061828613, "lr": 0.00018160000000000002, "elapsed_sec": 3737.205398797989, "step_time_sec": 8.230199912009994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 455, "loss": 6.5441718101501465, "lr": 0.000182, "elapsed_sec": 3745.4355154037476, "step_time_sec": 8.229917848977493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 456, "loss": 6.632120609283447, "lr": 0.00018240000000000002, "elapsed_sec": 3753.6641750335693, "step_time_sec": 8.228479457000503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 457, "loss": 6.571490287780762, "lr": 0.00018280000000000003, "elapsed_sec": 3761.8939430713654, "step_time_sec": 8.229619335994357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 458, "loss": 6.61398983001709, "lr": 0.0001832, "elapsed_sec": 3770.1206645965576, "step_time_sec": 8.22661801398499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 459, "loss": 6.566062927246094, "lr": 0.00018360000000000002, "elapsed_sec": 3778.3492205142975, "step_time_sec": 8.228446947003249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 460, "loss": 6.551438331604004, "lr": 0.000184, "elapsed_sec": 3786.576756477356, "step_time_sec": 8.227317045995733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 461, "loss": 6.5198564529418945, "lr": 0.0001844, "elapsed_sec": 3794.8061084747314, "step_time_sec": 8.229152346990304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 462, "loss": 6.648536682128906, "lr": 0.00018480000000000002, "elapsed_sec": 3803.0360612869263, "step_time_sec": 8.229820170992753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 463, "loss": 6.591145992279053, "lr": 0.0001852, "elapsed_sec": 3811.265480518341, "step_time_sec": 8.22932388499612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 464, "loss": 6.736073970794678, "lr": 0.0001856, "elapsed_sec": 3819.495080947876, "step_time_sec": 8.229444326017983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 465, "loss": 6.477832794189453, "lr": 0.000186, "elapsed_sec": 3827.724678993225, "step_time_sec": 8.229363858001307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 466, "loss": 6.582462310791016, "lr": 0.0001864, "elapsed_sec": 3835.9549267292023, "step_time_sec": 8.230072620994179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 467, "loss": 6.569265365600586, "lr": 0.00018680000000000001, "elapsed_sec": 3844.1855046749115, "step_time_sec": 8.230449675989803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 468, "loss": 6.480304718017578, "lr": 0.0001872, "elapsed_sec": 3852.413001060486, "step_time_sec": 8.22735167000792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 469, "loss": 6.471355438232422, "lr": 0.0001876, "elapsed_sec": 3860.6400561332703, "step_time_sec": 8.226883572002407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 470, "loss": 6.343580722808838, "lr": 0.000188, "elapsed_sec": 3868.868321418762, "step_time_sec": 8.22821160699823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 471, "loss": 6.606316566467285, "lr": 0.0001884, "elapsed_sec": 3877.095700979233, "step_time_sec": 8.227105232013855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 472, "loss": 6.599635124206543, "lr": 0.0001888, "elapsed_sec": 3885.3246805667877, "step_time_sec": 8.228918017994147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 473, "loss": 6.494200229644775, "lr": 0.00018920000000000002, "elapsed_sec": 3893.5536086559296, "step_time_sec": 8.228678734012647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 474, "loss": 6.653486728668213, "lr": 0.00018960000000000003, "elapsed_sec": 3901.7810661792755, "step_time_sec": 8.227325265994295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 475, "loss": 6.563943386077881, "lr": 0.00019, "elapsed_sec": 3910.0107176303864, "step_time_sec": 8.229492693993961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 476, "loss": 6.5613837242126465, "lr": 0.00019040000000000002, "elapsed_sec": 3918.240131378174, "step_time_sec": 8.229278422018979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 477, "loss": 6.431516170501709, "lr": 0.0001908, "elapsed_sec": 3926.470377445221, "step_time_sec": 8.230124869005522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 478, "loss": 6.424363613128662, "lr": 0.0001912, "elapsed_sec": 3934.699994325638, "step_time_sec": 8.229440176015487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 479, "loss": 6.484958171844482, "lr": 0.00019160000000000002, "elapsed_sec": 3942.9296686649323, "step_time_sec": 8.229461715003708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 480, "loss": 6.489154815673828, "lr": 0.000192, "elapsed_sec": 3951.1605067253113, "step_time_sec": 8.230688202020247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 481, "loss": 6.503838062286377, "lr": 0.00019240000000000001, "elapsed_sec": 3959.3870038986206, "step_time_sec": 8.226419508020626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 482, "loss": 6.496673583984375, "lr": 0.0001928, "elapsed_sec": 3967.61332154274, "step_time_sec": 8.226101991982432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 483, "loss": 6.614237308502197, "lr": 0.0001932, "elapsed_sec": 3975.843144416809, "step_time_sec": 8.229667998995865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 484, "loss": 6.548065662384033, "lr": 0.00019360000000000002, "elapsed_sec": 3984.0723934173584, "step_time_sec": 8.229073101014365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 485, "loss": 6.453685760498047, "lr": 0.000194, "elapsed_sec": 3992.301924228668, "step_time_sec": 8.229394088994013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 486, "loss": 6.454378604888916, "lr": 0.0001944, "elapsed_sec": 4000.5301082134247, "step_time_sec": 8.228083218011307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 487, "loss": 6.566827774047852, "lr": 0.0001948, "elapsed_sec": 4008.758858680725, "step_time_sec": 8.228523680998478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 488, "loss": 6.546975135803223, "lr": 0.0001952, "elapsed_sec": 4016.9861035346985, "step_time_sec": 8.227088941988768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 489, "loss": 6.531592845916748, "lr": 0.00019559999999999998, "elapsed_sec": 4025.214702606201, "step_time_sec": 8.228451836010208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 490, "loss": 6.465081691741943, "lr": 0.00019600000000000002, "elapsed_sec": 4033.4420692920685, "step_time_sec": 8.227223577006953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 491, "loss": 6.538581371307373, "lr": 0.00019640000000000003, "elapsed_sec": 4041.670809984207, "step_time_sec": 8.228632175014354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 492, "loss": 6.579845428466797, "lr": 0.0001968, "elapsed_sec": 4049.9004366397858, "step_time_sec": 8.229405597987352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 493, "loss": 6.528842926025391, "lr": 0.00019720000000000002, "elapsed_sec": 4058.1297760009766, "step_time_sec": 8.229204097006004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 494, "loss": 6.736555099487305, "lr": 0.0001976, "elapsed_sec": 4066.3587987422943, "step_time_sec": 8.22891586998594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 495, "loss": 6.547087669372559, "lr": 0.00019800000000000002, "elapsed_sec": 4074.5888118743896, "step_time_sec": 8.22981893501128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 496, "loss": 6.559807777404785, "lr": 0.00019840000000000002, "elapsed_sec": 4082.8189220428467, "step_time_sec": 8.229948043008335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 497, "loss": 6.418319225311279, "lr": 0.0001988, "elapsed_sec": 4091.048652410507, "step_time_sec": 8.229583473003004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 498, "loss": 6.4480671882629395, "lr": 0.00019920000000000002, "elapsed_sec": 4099.2772126197815, "step_time_sec": 8.228469617984956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 499, "loss": 6.488837718963623, "lr": 0.0001996, "elapsed_sec": 4107.506419420242, "step_time_sec": 8.228980370011413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 500, "loss": 6.376605987548828, "lr": 0.0002, "elapsed_sec": 4115.736721992493, "step_time_sec": 29.606893475021934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 501, "loss": 6.429925918579102, "lr": 0.0002, "elapsed_sec": 4145.354514837265, "step_time_sec": 8.240592629997991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 502, "loss": 6.502254962921143, "lr": 0.0002, "elapsed_sec": 4153.569623231888, "step_time_sec": 8.214968895976199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 503, "loss": 6.530492305755615, "lr": 0.0002, "elapsed_sec": 4161.784911394119, "step_time_sec": 8.215097236010479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 504, "loss": 6.52166748046875, "lr": 0.0002, "elapsed_sec": 4170.000780582428, "step_time_sec": 8.215737085993169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 505, "loss": 6.443513870239258, "lr": 0.0002, "elapsed_sec": 4178.216514110565, "step_time_sec": 8.215493676019832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 506, "loss": 6.6494140625, "lr": 0.0002, "elapsed_sec": 4186.431793689728, "step_time_sec": 8.215142236003885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 507, "loss": 6.5508928298950195, "lr": 0.0002, "elapsed_sec": 4194.647248506546, "step_time_sec": 8.215311710984679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 508, "loss": 6.749892711639404, "lr": 0.0002, "elapsed_sec": 4202.865496397018, "step_time_sec": 8.218063117004931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 509, "loss": 6.535534858703613, "lr": 0.0002, "elapsed_sec": 4211.093318939209, "step_time_sec": 8.227668181993067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 510, "loss": 6.477221488952637, "lr": 0.0002, "elapsed_sec": 4219.321380615234, "step_time_sec": 8.227942262019496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 511, "loss": 6.435550212860107, "lr": 0.0002, "elapsed_sec": 4227.551151514053, "step_time_sec": 8.229592584015336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 512, "loss": 6.367842674255371, "lr": 0.0002, "elapsed_sec": 4235.780768871307, "step_time_sec": 8.229504507005913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 513, "loss": 6.480863094329834, "lr": 0.0002, "elapsed_sec": 4244.010126829147, "step_time_sec": 8.22916124999756, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 514, "loss": 6.448121547698975, "lr": 0.0002, "elapsed_sec": 4252.240030050278, "step_time_sec": 8.229742961993907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 515, "loss": 6.433417320251465, "lr": 0.0002, "elapsed_sec": 4260.47011756897, "step_time_sec": 8.229918175988132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 516, "loss": 6.430978775024414, "lr": 0.0002, "elapsed_sec": 4268.698853254318, "step_time_sec": 8.228550881001865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 517, "loss": 6.361861228942871, "lr": 0.0002, "elapsed_sec": 4276.926242351532, "step_time_sec": 8.22729494702071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 518, "loss": 6.4009504318237305, "lr": 0.0002, "elapsed_sec": 4285.153876066208, "step_time_sec": 8.227408761013066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 519, "loss": 6.381475448608398, "lr": 0.0002, "elapsed_sec": 4293.38170671463, "step_time_sec": 8.22767253901111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 520, "loss": 6.434328079223633, "lr": 0.0002, "elapsed_sec": 4301.608563423157, "step_time_sec": 8.226761758007342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 521, "loss": 6.382274627685547, "lr": 0.0002, "elapsed_sec": 4309.835290670395, "step_time_sec": 8.22651795699494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 522, "loss": 6.4586100578308105, "lr": 0.0002, "elapsed_sec": 4318.064303636551, "step_time_sec": 8.228860937000718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 523, "loss": 6.362071990966797, "lr": 0.0002, "elapsed_sec": 4326.291554927826, "step_time_sec": 8.227095490001375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 524, "loss": 6.547203063964844, "lr": 0.0002, "elapsed_sec": 4334.520969629288, "step_time_sec": 8.229258216975722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 525, "loss": 6.488918304443359, "lr": 0.0002, "elapsed_sec": 4342.751107931137, "step_time_sec": 8.229989464016398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 526, "loss": 6.4822845458984375, "lr": 0.0002, "elapsed_sec": 4350.979013204575, "step_time_sec": 8.22779167999397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 527, "loss": 6.419869899749756, "lr": 0.0002, "elapsed_sec": 4359.206169128418, "step_time_sec": 8.227020374994026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 528, "loss": 6.5425262451171875, "lr": 0.0002, "elapsed_sec": 4367.434513092041, "step_time_sec": 8.228111106989672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 529, "loss": 6.390000820159912, "lr": 0.0002, "elapsed_sec": 4375.663655996323, "step_time_sec": 8.229046241991455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 530, "loss": 6.36940336227417, "lr": 0.0002, "elapsed_sec": 4383.893516778946, "step_time_sec": 8.229666370985797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 531, "loss": 6.47836446762085, "lr": 0.0002, "elapsed_sec": 4392.1228539943695, "step_time_sec": 8.229139119008323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 532, "loss": 6.394843578338623, "lr": 0.0002, "elapsed_sec": 4400.352530956268, "step_time_sec": 8.22954027398373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 533, "loss": 6.439940452575684, "lr": 0.0002, "elapsed_sec": 4408.582279920578, "step_time_sec": 8.22960244701244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 534, "loss": 6.40760612487793, "lr": 0.0002, "elapsed_sec": 4416.8119168281555, "step_time_sec": 8.229523288988275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 535, "loss": 6.334616661071777, "lr": 0.0002, "elapsed_sec": 4425.04106259346, "step_time_sec": 8.22895014900132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 536, "loss": 6.420666217803955, "lr": 0.0002, "elapsed_sec": 4433.271058797836, "step_time_sec": 8.229837947990745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 537, "loss": 6.355462551116943, "lr": 0.0002, "elapsed_sec": 4441.50109910965, "step_time_sec": 8.229894587973831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 538, "loss": 6.3983564376831055, "lr": 0.0002, "elapsed_sec": 4449.729675769806, "step_time_sec": 8.228428747999715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 539, "loss": 6.417647361755371, "lr": 0.0002, "elapsed_sec": 4457.959157705307, "step_time_sec": 8.229317069984972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 540, "loss": 6.551478862762451, "lr": 0.0002, "elapsed_sec": 4466.186795711517, "step_time_sec": 8.22751190298004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 541, "loss": 6.402220249176025, "lr": 0.0002, "elapsed_sec": 4474.413767814636, "step_time_sec": 8.22673053998733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 542, "loss": 6.380388259887695, "lr": 0.0002, "elapsed_sec": 4482.640773773193, "step_time_sec": 8.226870576996589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 543, "loss": 6.397372722625732, "lr": 0.0002, "elapsed_sec": 4490.871415615082, "step_time_sec": 8.230522414989537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 544, "loss": 6.358451843261719, "lr": 0.0002, "elapsed_sec": 4499.100872993469, "step_time_sec": 8.229267369984882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 545, "loss": 6.388802528381348, "lr": 0.0002, "elapsed_sec": 4507.33007645607, "step_time_sec": 8.229045626998413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 546, "loss": 6.378384113311768, "lr": 0.0002, "elapsed_sec": 4515.558967828751, "step_time_sec": 8.228775870986283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 547, "loss": 6.4733099937438965, "lr": 0.0002, "elapsed_sec": 4523.7871351242065, "step_time_sec": 8.227962024015142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 548, "loss": 6.395536422729492, "lr": 0.0002, "elapsed_sec": 4532.015642881393, "step_time_sec": 8.22838765499182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 549, "loss": 6.448266983032227, "lr": 0.0002, "elapsed_sec": 4540.244111776352, "step_time_sec": 8.228290090017254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 550, "loss": 6.382241249084473, "lr": 0.0002, "elapsed_sec": 4548.472925186157, "step_time_sec": 8.228686769027263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 551, "loss": 6.322267532348633, "lr": 0.0002, "elapsed_sec": 4556.703072309494, "step_time_sec": 8.229932893998921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 552, "loss": 6.531739234924316, "lr": 0.0002, "elapsed_sec": 4564.931708812714, "step_time_sec": 8.228480218996992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 553, "loss": 6.3316731452941895, "lr": 0.0002, "elapsed_sec": 4573.16032743454, "step_time_sec": 8.22846386799938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 554, "loss": 6.402317047119141, "lr": 0.0002, "elapsed_sec": 4581.387717247009, "step_time_sec": 8.227290803013602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 555, "loss": 6.3660359382629395, "lr": 0.0002, "elapsed_sec": 4589.615182876587, "step_time_sec": 8.227254691999406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 556, "loss": 6.357669830322266, "lr": 0.0002, "elapsed_sec": 4597.843759059906, "step_time_sec": 8.228420075989561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 557, "loss": 6.44683313369751, "lr": 0.0002, "elapsed_sec": 4606.072851657867, "step_time_sec": 8.228994750999846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 558, "loss": 6.376293659210205, "lr": 0.0002, "elapsed_sec": 4614.30327296257, "step_time_sec": 8.230212021997431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 559, "loss": 6.387221336364746, "lr": 0.0002, "elapsed_sec": 4622.533170223236, "step_time_sec": 8.229731508996338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 560, "loss": 6.343684196472168, "lr": 0.0002, "elapsed_sec": 4630.761999845505, "step_time_sec": 8.228733432013541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 561, "loss": 6.435442924499512, "lr": 0.0002, "elapsed_sec": 4638.991513490677, "step_time_sec": 8.229290246003075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 562, "loss": 6.3415021896362305, "lr": 0.0002, "elapsed_sec": 4647.220664262772, "step_time_sec": 8.229017637990182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 563, "loss": 6.305459499359131, "lr": 0.0002, "elapsed_sec": 4655.450455665588, "step_time_sec": 8.229602780018467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 564, "loss": 6.382946014404297, "lr": 0.0002, "elapsed_sec": 4663.679627656937, "step_time_sec": 8.229031296999892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 565, "loss": 6.339844703674316, "lr": 0.0002, "elapsed_sec": 4671.91014456749, "step_time_sec": 8.230368060001638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 566, "loss": 6.373386383056641, "lr": 0.0002, "elapsed_sec": 4680.139800071716, "step_time_sec": 8.229490586003521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 567, "loss": 6.4064106941223145, "lr": 0.0002, "elapsed_sec": 4688.366386890411, "step_time_sec": 8.226428684021812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 568, "loss": 6.272947788238525, "lr": 0.0002, "elapsed_sec": 4696.59454870224, "step_time_sec": 8.228024815995013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 569, "loss": 6.3719868659973145, "lr": 0.0002, "elapsed_sec": 4704.821711778641, "step_time_sec": 8.226948987983633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 570, "loss": 6.394576072692871, "lr": 0.0002, "elapsed_sec": 4713.048914670944, "step_time_sec": 8.227034379000543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 571, "loss": 6.298750877380371, "lr": 0.0002, "elapsed_sec": 4721.2765417099, "step_time_sec": 8.227518056024564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 572, "loss": 6.25969123840332, "lr": 0.0002, "elapsed_sec": 4729.507028341293, "step_time_sec": 8.230300019000424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 573, "loss": 6.339113712310791, "lr": 0.0002, "elapsed_sec": 4737.737082958221, "step_time_sec": 8.229848904011305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 574, "loss": 6.228273391723633, "lr": 0.0002, "elapsed_sec": 4745.966969490051, "step_time_sec": 8.229670797998551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 575, "loss": 6.250462055206299, "lr": 0.0002, "elapsed_sec": 4754.1945033073425, "step_time_sec": 8.22739127499517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 576, "loss": 6.337723731994629, "lr": 0.0002, "elapsed_sec": 4762.423032283783, "step_time_sec": 8.228383055015001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 577, "loss": 6.564232349395752, "lr": 0.0002, "elapsed_sec": 4770.649978399277, "step_time_sec": 8.226802605000557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 578, "loss": 6.532543659210205, "lr": 0.0002, "elapsed_sec": 4778.877529382706, "step_time_sec": 8.227339753997512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 579, "loss": 6.31834602355957, "lr": 0.0002, "elapsed_sec": 4787.10743021965, "step_time_sec": 8.229775932995835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 580, "loss": 6.339108467102051, "lr": 0.0002, "elapsed_sec": 4795.33683013916, "step_time_sec": 8.229195980995428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 581, "loss": 6.370757579803467, "lr": 0.0002, "elapsed_sec": 4803.566791296005, "step_time_sec": 8.229812007019063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 582, "loss": 6.367822647094727, "lr": 0.0002, "elapsed_sec": 4811.7946672439575, "step_time_sec": 8.227708708000137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 583, "loss": 6.274897575378418, "lr": 0.0002, "elapsed_sec": 4820.0220239162445, "step_time_sec": 8.227213161007967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 584, "loss": 6.346560478210449, "lr": 0.0002, "elapsed_sec": 4828.247556686401, "step_time_sec": 8.225358113995753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 585, "loss": 6.30031681060791, "lr": 0.0002, "elapsed_sec": 4836.475522279739, "step_time_sec": 8.227881427999819, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 586, "loss": 6.516203880310059, "lr": 0.0002, "elapsed_sec": 4844.703823328018, "step_time_sec": 8.228080438013421, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 587, "loss": 6.268025875091553, "lr": 0.0002, "elapsed_sec": 4852.932485103607, "step_time_sec": 8.228493553993758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 588, "loss": 6.2829909324646, "lr": 0.0002, "elapsed_sec": 4861.1612594127655, "step_time_sec": 8.228612564009381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 589, "loss": 6.2796950340271, "lr": 0.0002, "elapsed_sec": 4869.387929677963, "step_time_sec": 8.226539128983859, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 590, "loss": 6.25394344329834, "lr": 0.0002, "elapsed_sec": 4877.617498159409, "step_time_sec": 8.229421283002011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 591, "loss": 6.216843605041504, "lr": 0.0002, "elapsed_sec": 4885.84440612793, "step_time_sec": 8.226739924022695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 592, "loss": 6.297022819519043, "lr": 0.0002, "elapsed_sec": 4894.072231054306, "step_time_sec": 8.227679392992286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 593, "loss": 6.289453506469727, "lr": 0.0002, "elapsed_sec": 4902.298044204712, "step_time_sec": 8.22567200800404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 594, "loss": 6.452700138092041, "lr": 0.0002, "elapsed_sec": 4910.528904438019, "step_time_sec": 8.230678833002457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 595, "loss": 6.277034759521484, "lr": 0.0002, "elapsed_sec": 4918.758177518845, "step_time_sec": 8.229090774024371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 596, "loss": 6.281892776489258, "lr": 0.0002, "elapsed_sec": 4926.988025665283, "step_time_sec": 8.229721847979818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 597, "loss": 6.325608253479004, "lr": 0.0002, "elapsed_sec": 4935.217805862427, "step_time_sec": 8.229644278995693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 598, "loss": 6.306236267089844, "lr": 0.0002, "elapsed_sec": 4943.447399377823, "step_time_sec": 8.229363083984936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 599, "loss": 6.401443004608154, "lr": 0.0002, "elapsed_sec": 4951.674699544907, "step_time_sec": 8.227204308001092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 600, "loss": 6.3036789894104, "lr": 0.0002, "elapsed_sec": 4959.903571128845, "step_time_sec": 8.22863346899976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 601, "loss": 6.443446159362793, "lr": 0.0002, "elapsed_sec": 4968.131560325623, "step_time_sec": 8.227882064995356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 602, "loss": 6.274202823638916, "lr": 0.0002, "elapsed_sec": 4976.359411239624, "step_time_sec": 8.227649058011593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 603, "loss": 6.397613048553467, "lr": 0.0002, "elapsed_sec": 4984.588287115097, "step_time_sec": 8.228697893006029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 604, "loss": 6.291489601135254, "lr": 0.0002, "elapsed_sec": 4992.817261695862, "step_time_sec": 8.228823515004478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 605, "loss": 6.262211322784424, "lr": 0.0002, "elapsed_sec": 5001.047587633133, "step_time_sec": 8.230128652008716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 606, "loss": 6.274276256561279, "lr": 0.0002, "elapsed_sec": 5009.277063131332, "step_time_sec": 8.229329649999272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 607, "loss": 6.241001605987549, "lr": 0.0002, "elapsed_sec": 5017.507066249847, "step_time_sec": 8.229870008013677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 608, "loss": 6.275449275970459, "lr": 0.0002, "elapsed_sec": 5025.7373046875, "step_time_sec": 8.230047521006782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 609, "loss": 6.159284591674805, "lr": 0.0002, "elapsed_sec": 5033.966765165329, "step_time_sec": 8.229274833021918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 610, "loss": 6.268886566162109, "lr": 0.0002, "elapsed_sec": 5042.196095705032, "step_time_sec": 8.229239042004338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 611, "loss": 6.169172286987305, "lr": 0.0002, "elapsed_sec": 5050.425016641617, "step_time_sec": 8.228674256009981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 612, "loss": 6.304986476898193, "lr": 0.0002, "elapsed_sec": 5058.6552629470825, "step_time_sec": 8.230093015008606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 613, "loss": 6.270630359649658, "lr": 0.0002, "elapsed_sec": 5066.8833973407745, "step_time_sec": 8.228018500987673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 614, "loss": 6.212421417236328, "lr": 0.0002, "elapsed_sec": 5075.112117767334, "step_time_sec": 8.228487613989273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 615, "loss": 6.444400310516357, "lr": 0.0002, "elapsed_sec": 5083.342001199722, "step_time_sec": 8.22968886300805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 616, "loss": 6.128503322601318, "lr": 0.0002, "elapsed_sec": 5091.571264266968, "step_time_sec": 8.229198001994519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 617, "loss": 6.217636585235596, "lr": 0.0002, "elapsed_sec": 5099.801372766495, "step_time_sec": 8.229883705993416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 618, "loss": 6.198116779327393, "lr": 0.0002, "elapsed_sec": 5108.031309127808, "step_time_sec": 8.229802729008952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 619, "loss": 6.224739074707031, "lr": 0.0002, "elapsed_sec": 5116.260021209717, "step_time_sec": 8.228527275001397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 620, "loss": 6.321182727813721, "lr": 0.0002, "elapsed_sec": 5124.487384080887, "step_time_sec": 8.227189693017863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 621, "loss": 6.305802345275879, "lr": 0.0002, "elapsed_sec": 5132.714299440384, "step_time_sec": 8.226777292002225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 622, "loss": 6.210368633270264, "lr": 0.0002, "elapsed_sec": 5140.942606449127, "step_time_sec": 8.228145764995134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 623, "loss": 6.092025279998779, "lr": 0.0002, "elapsed_sec": 5149.171190023422, "step_time_sec": 8.228456929995446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 624, "loss": 6.232975482940674, "lr": 0.0002, "elapsed_sec": 5157.399624586105, "step_time_sec": 8.22830369998701, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 625, "loss": 6.323404788970947, "lr": 0.0002, "elapsed_sec": 5165.625571012497, "step_time_sec": 8.225726440985454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 626, "loss": 6.197755336761475, "lr": 0.0002, "elapsed_sec": 5173.884917497635, "step_time_sec": 8.23685468902113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 627, "loss": 6.186779975891113, "lr": 0.0002, "elapsed_sec": 5182.112818002701, "step_time_sec": 8.227658224001061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 628, "loss": 6.258362770080566, "lr": 0.0002, "elapsed_sec": 5190.341230630875, "step_time_sec": 8.228218494012253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 629, "loss": 6.173934459686279, "lr": 0.0002, "elapsed_sec": 5198.571209669113, "step_time_sec": 8.229836362996139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 630, "loss": 6.168494701385498, "lr": 0.0002, "elapsed_sec": 5206.798898458481, "step_time_sec": 8.227595071977703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 631, "loss": 6.323765754699707, "lr": 0.0002, "elapsed_sec": 5215.028181314468, "step_time_sec": 8.229109298001276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 632, "loss": 6.136216163635254, "lr": 0.0002, "elapsed_sec": 5223.255671262741, "step_time_sec": 8.227290972019546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 633, "loss": 6.267079830169678, "lr": 0.0002, "elapsed_sec": 5231.485114812851, "step_time_sec": 8.229358290001983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 634, "loss": 6.336837291717529, "lr": 0.0002, "elapsed_sec": 5239.715103387833, "step_time_sec": 8.22976464300882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 635, "loss": 6.195796012878418, "lr": 0.0002, "elapsed_sec": 5247.944612979889, "step_time_sec": 8.229344694991596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 636, "loss": 6.23333740234375, "lr": 0.0002, "elapsed_sec": 5256.173502206802, "step_time_sec": 8.22878973200568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 637, "loss": 6.269836902618408, "lr": 0.0002, "elapsed_sec": 5264.403213024139, "step_time_sec": 8.229535906983074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 638, "loss": 6.322137355804443, "lr": 0.0002, "elapsed_sec": 5272.633345127106, "step_time_sec": 8.229961197008379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 639, "loss": 6.1107587814331055, "lr": 0.0002, "elapsed_sec": 5280.863160610199, "step_time_sec": 8.229651055007707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 640, "loss": 6.200603008270264, "lr": 0.0002, "elapsed_sec": 5289.092659950256, "step_time_sec": 8.229310302995145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 641, "loss": 6.3400797843933105, "lr": 0.0002, "elapsed_sec": 5297.322039842606, "step_time_sec": 8.229260556981899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 642, "loss": 6.261970043182373, "lr": 0.0002, "elapsed_sec": 5305.5515303611755, "step_time_sec": 8.229290933988523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 643, "loss": 6.2697014808654785, "lr": 0.0002, "elapsed_sec": 5313.7813584804535, "step_time_sec": 8.229673317982815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 644, "loss": 6.192412376403809, "lr": 0.0002, "elapsed_sec": 5322.010882616043, "step_time_sec": 8.229400237003574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 645, "loss": 6.234358787536621, "lr": 0.0002, "elapsed_sec": 5330.240886211395, "step_time_sec": 8.229801884997869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 646, "loss": 6.229666709899902, "lr": 0.0002, "elapsed_sec": 5338.470154285431, "step_time_sec": 8.229163445008453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 647, "loss": 6.290761470794678, "lr": 0.0002, "elapsed_sec": 5346.700046539307, "step_time_sec": 8.229666045983322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 648, "loss": 6.24416971206665, "lr": 0.0002, "elapsed_sec": 5354.9300010204315, "step_time_sec": 8.229842386994278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 649, "loss": 6.28919792175293, "lr": 0.0002, "elapsed_sec": 5363.157231330872, "step_time_sec": 8.226999209000496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 650, "loss": 6.18079137802124, "lr": 0.0002, "elapsed_sec": 5371.38464641571, "step_time_sec": 8.22726451800554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 651, "loss": 6.332503795623779, "lr": 0.0002, "elapsed_sec": 5379.614539146423, "step_time_sec": 8.229727316997014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 652, "loss": 6.345889568328857, "lr": 0.0002, "elapsed_sec": 5387.843027591705, "step_time_sec": 8.228358698019292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 653, "loss": 6.167347431182861, "lr": 0.0002, "elapsed_sec": 5396.072008609772, "step_time_sec": 8.228773597016698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 654, "loss": 6.129332065582275, "lr": 0.0002, "elapsed_sec": 5404.301355838776, "step_time_sec": 8.229184446012368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 655, "loss": 6.233705520629883, "lr": 0.0002, "elapsed_sec": 5412.531193494797, "step_time_sec": 8.229710008017719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 656, "loss": 6.348316192626953, "lr": 0.0002, "elapsed_sec": 5420.758432865143, "step_time_sec": 8.227024755004095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 657, "loss": 6.156805038452148, "lr": 0.0002, "elapsed_sec": 5428.986213684082, "step_time_sec": 8.227663541998481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 658, "loss": 6.145750522613525, "lr": 0.0002, "elapsed_sec": 5437.213907241821, "step_time_sec": 8.227561747014988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 659, "loss": 6.294693470001221, "lr": 0.0002, "elapsed_sec": 5445.442889928818, "step_time_sec": 8.228785357991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 660, "loss": 6.153820037841797, "lr": 0.0002, "elapsed_sec": 5453.670357465744, "step_time_sec": 8.227287825982785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 661, "loss": 6.216848373413086, "lr": 0.0002, "elapsed_sec": 5461.899628639221, "step_time_sec": 8.22909671801608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 662, "loss": 6.126791000366211, "lr": 0.0002, "elapsed_sec": 5470.129055738449, "step_time_sec": 8.229276544007007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 663, "loss": 6.12830114364624, "lr": 0.0002, "elapsed_sec": 5478.358179092407, "step_time_sec": 8.229039441997884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 664, "loss": 6.292793273925781, "lr": 0.0002, "elapsed_sec": 5486.587469816208, "step_time_sec": 8.229061853024177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 665, "loss": 6.253063678741455, "lr": 0.0002, "elapsed_sec": 5494.81729054451, "step_time_sec": 8.229710209008772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 666, "loss": 6.228337287902832, "lr": 0.0002, "elapsed_sec": 5503.04408288002, "step_time_sec": 8.226629558979766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 667, "loss": 6.162898540496826, "lr": 0.0002, "elapsed_sec": 5511.271881580353, "step_time_sec": 8.22761014799471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 668, "loss": 6.161266803741455, "lr": 0.0002, "elapsed_sec": 5519.498777151108, "step_time_sec": 8.226731080008904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 669, "loss": 6.1685261726379395, "lr": 0.0002, "elapsed_sec": 5527.726466417313, "step_time_sec": 8.227539690997219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 670, "loss": 6.272224426269531, "lr": 0.0002, "elapsed_sec": 5535.954494714737, "step_time_sec": 8.227846382011194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 671, "loss": 6.1233367919921875, "lr": 0.0002, "elapsed_sec": 5544.182520866394, "step_time_sec": 8.227869377005845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 672, "loss": 6.192683219909668, "lr": 0.0002, "elapsed_sec": 5552.409935712814, "step_time_sec": 8.227316156000597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 673, "loss": 6.219451427459717, "lr": 0.0002, "elapsed_sec": 5560.638453960419, "step_time_sec": 8.228278966009384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 674, "loss": 6.193971633911133, "lr": 0.0002, "elapsed_sec": 5568.867523908615, "step_time_sec": 8.228918023989536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 675, "loss": 6.131769180297852, "lr": 0.0002, "elapsed_sec": 5577.097105503082, "step_time_sec": 8.229440511000575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 676, "loss": 6.201135635375977, "lr": 0.0002, "elapsed_sec": 5585.32644200325, "step_time_sec": 8.229160666989628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 677, "loss": 6.21390438079834, "lr": 0.0002, "elapsed_sec": 5593.555561065674, "step_time_sec": 8.229024709988153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 678, "loss": 6.083154201507568, "lr": 0.0002, "elapsed_sec": 5601.7854771614075, "step_time_sec": 8.229679161013337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 679, "loss": 6.1540937423706055, "lr": 0.0002, "elapsed_sec": 5610.014412641525, "step_time_sec": 8.228781893005362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 680, "loss": 6.095361232757568, "lr": 0.0002, "elapsed_sec": 5618.244918823242, "step_time_sec": 8.230348391982261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 681, "loss": 6.216634750366211, "lr": 0.0002, "elapsed_sec": 5626.4744753837585, "step_time_sec": 8.229382374993293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 682, "loss": 6.228910446166992, "lr": 0.0002, "elapsed_sec": 5634.702681064606, "step_time_sec": 8.228050940000685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 683, "loss": 6.17949104309082, "lr": 0.0002, "elapsed_sec": 5642.932439565659, "step_time_sec": 8.22962815099163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 684, "loss": 6.256796360015869, "lr": 0.0002, "elapsed_sec": 5651.162771940231, "step_time_sec": 8.230158122023568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 685, "loss": 6.294649600982666, "lr": 0.0002, "elapsed_sec": 5659.392623186111, "step_time_sec": 8.229720121016726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 686, "loss": 6.031900405883789, "lr": 0.0002, "elapsed_sec": 5667.62136888504, "step_time_sec": 8.22861254599411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 687, "loss": 6.103900909423828, "lr": 0.0002, "elapsed_sec": 5675.850527763367, "step_time_sec": 8.228919976012548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 688, "loss": 6.052828788757324, "lr": 0.0002, "elapsed_sec": 5684.080566644669, "step_time_sec": 8.229861269996036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 689, "loss": 6.166463851928711, "lr": 0.0002, "elapsed_sec": 5692.308190107346, "step_time_sec": 8.227535974991042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 690, "loss": 6.150754928588867, "lr": 0.0002, "elapsed_sec": 5700.536917209625, "step_time_sec": 8.228532077977434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 691, "loss": 6.188103199005127, "lr": 0.0002, "elapsed_sec": 5708.764824390411, "step_time_sec": 8.227757427986944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 692, "loss": 6.245376110076904, "lr": 0.0002, "elapsed_sec": 5716.994243383408, "step_time_sec": 8.229243439011043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 693, "loss": 6.156307220458984, "lr": 0.0002, "elapsed_sec": 5725.222449302673, "step_time_sec": 8.228024865995394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 694, "loss": 6.110998630523682, "lr": 0.0002, "elapsed_sec": 5733.45032787323, "step_time_sec": 8.227756957989186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 695, "loss": 6.209361553192139, "lr": 0.0002, "elapsed_sec": 5741.678519248962, "step_time_sec": 8.22800444898894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 696, "loss": 6.138263702392578, "lr": 0.0002, "elapsed_sec": 5749.906722784042, "step_time_sec": 8.228034569998272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 697, "loss": 6.1043572425842285, "lr": 0.0002, "elapsed_sec": 5758.135551691055, "step_time_sec": 8.228647182986606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 698, "loss": 6.19089937210083, "lr": 0.0002, "elapsed_sec": 5766.365439414978, "step_time_sec": 8.229816153994761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 699, "loss": 6.250263690948486, "lr": 0.0002, "elapsed_sec": 5774.59458732605, "step_time_sec": 8.228935338003794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 700, "loss": 6.145717620849609, "lr": 0.0002, "elapsed_sec": 5782.821449518204, "step_time_sec": 8.22672589099966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 701, "loss": 6.241276264190674, "lr": 0.0002, "elapsed_sec": 5791.050629377365, "step_time_sec": 8.229002480016788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 702, "loss": 6.174741744995117, "lr": 0.0002, "elapsed_sec": 5799.277525663376, "step_time_sec": 8.226747937005712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 703, "loss": 6.202995777130127, "lr": 0.0002, "elapsed_sec": 5807.505534410477, "step_time_sec": 8.227856337995036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 704, "loss": 6.132613658905029, "lr": 0.0002, "elapsed_sec": 5815.734062671661, "step_time_sec": 8.228345170995453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 705, "loss": 6.314485549926758, "lr": 0.0002, "elapsed_sec": 5823.9637088775635, "step_time_sec": 8.229564795998158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 706, "loss": 6.175285339355469, "lr": 0.0002, "elapsed_sec": 5832.193297386169, "step_time_sec": 8.229356432973873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 707, "loss": 6.110280513763428, "lr": 0.0002, "elapsed_sec": 5840.42297911644, "step_time_sec": 8.229530598997371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 708, "loss": 6.15895938873291, "lr": 0.0002, "elapsed_sec": 5848.652250051498, "step_time_sec": 8.229113205015892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 709, "loss": 6.071022033691406, "lr": 0.0002, "elapsed_sec": 5856.8819625377655, "step_time_sec": 8.229537685983814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 710, "loss": 6.122594833374023, "lr": 0.0002, "elapsed_sec": 5865.111541509628, "step_time_sec": 8.229433150991099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 711, "loss": 6.163331985473633, "lr": 0.0002, "elapsed_sec": 5873.342009544373, "step_time_sec": 8.230339881993132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 712, "loss": 6.187577724456787, "lr": 0.0002, "elapsed_sec": 5881.57205080986, "step_time_sec": 8.229837493010564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 713, "loss": 6.14393949508667, "lr": 0.0002, "elapsed_sec": 5889.801565647125, "step_time_sec": 8.229363574006129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 714, "loss": 6.091925144195557, "lr": 0.0002, "elapsed_sec": 5898.0301632881165, "step_time_sec": 8.228478218021337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 715, "loss": 6.191019058227539, "lr": 0.0002, "elapsed_sec": 5906.259252071381, "step_time_sec": 8.22889188802219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 716, "loss": 6.093130111694336, "lr": 0.0002, "elapsed_sec": 5914.488817930222, "step_time_sec": 8.229395050992025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 717, "loss": 6.102323532104492, "lr": 0.0002, "elapsed_sec": 5922.718480825424, "step_time_sec": 8.22955992000061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 718, "loss": 6.048957824707031, "lr": 0.0002, "elapsed_sec": 5930.948971271515, "step_time_sec": 8.230341215996305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 719, "loss": 6.1512627601623535, "lr": 0.0002, "elapsed_sec": 5939.178151607513, "step_time_sec": 8.229015398013871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 720, "loss": 6.133179187774658, "lr": 0.0002, "elapsed_sec": 5947.405771493912, "step_time_sec": 8.227413230983075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 721, "loss": 6.207662105560303, "lr": 0.0002, "elapsed_sec": 5955.6346163749695, "step_time_sec": 8.228682246000972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 722, "loss": 6.156109809875488, "lr": 0.0002, "elapsed_sec": 5963.863622188568, "step_time_sec": 8.228842020995216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 723, "loss": 6.0492401123046875, "lr": 0.0002, "elapsed_sec": 5972.09214758873, "step_time_sec": 8.228373273013858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 724, "loss": 6.11249303817749, "lr": 0.0002, "elapsed_sec": 5980.32081413269, "step_time_sec": 8.22849361601402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 725, "loss": 6.061827182769775, "lr": 0.0002, "elapsed_sec": 5988.548986911774, "step_time_sec": 8.228043531998992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 726, "loss": 6.206127166748047, "lr": 0.0002, "elapsed_sec": 5996.778615951538, "step_time_sec": 8.229455867985962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 727, "loss": 6.153193950653076, "lr": 0.0002, "elapsed_sec": 6005.007161617279, "step_time_sec": 8.22839465099969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 728, "loss": 6.050606727600098, "lr": 0.0002, "elapsed_sec": 6013.236143350601, "step_time_sec": 8.228830031992402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 729, "loss": 6.276336669921875, "lr": 0.0002, "elapsed_sec": 6021.466197252274, "step_time_sec": 8.229872901982162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 730, "loss": 6.19215726852417, "lr": 0.0002, "elapsed_sec": 6029.69612455368, "step_time_sec": 8.229783688992029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 731, "loss": 6.136540412902832, "lr": 0.0002, "elapsed_sec": 6037.926260471344, "step_time_sec": 8.230025947996182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 732, "loss": 6.015045166015625, "lr": 0.0002, "elapsed_sec": 6046.155958414078, "step_time_sec": 8.22953298300854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 733, "loss": 6.034610748291016, "lr": 0.0002, "elapsed_sec": 6054.385951042175, "step_time_sec": 8.229766092001228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 734, "loss": 6.075456142425537, "lr": 0.0002, "elapsed_sec": 6062.615327835083, "step_time_sec": 8.22922212901176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 735, "loss": 6.086706638336182, "lr": 0.0002, "elapsed_sec": 6070.842189788818, "step_time_sec": 8.226777867006604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 736, "loss": 6.156525611877441, "lr": 0.0002, "elapsed_sec": 6079.070022344589, "step_time_sec": 8.227628111984814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 737, "loss": 6.07059907913208, "lr": 0.0002, "elapsed_sec": 6087.29859828949, "step_time_sec": 8.2284133079811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 738, "loss": 6.092706680297852, "lr": 0.0002, "elapsed_sec": 6095.527815818787, "step_time_sec": 8.229046143009327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 739, "loss": 6.073668956756592, "lr": 0.0002, "elapsed_sec": 6103.756119012833, "step_time_sec": 8.228213927010074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 740, "loss": 6.195743083953857, "lr": 0.0002, "elapsed_sec": 6111.9863386154175, "step_time_sec": 8.230033889005426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 741, "loss": 6.166141986846924, "lr": 0.0002, "elapsed_sec": 6120.216199159622, "step_time_sec": 8.2296662350127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 742, "loss": 6.051299571990967, "lr": 0.0002, "elapsed_sec": 6128.446484088898, "step_time_sec": 8.230148331989767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 743, "loss": 6.1156697273254395, "lr": 0.0002, "elapsed_sec": 6136.675990343094, "step_time_sec": 8.229386250983225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 744, "loss": 6.01693058013916, "lr": 0.0002, "elapsed_sec": 6144.905163049698, "step_time_sec": 8.228984501009108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 745, "loss": 5.97791051864624, "lr": 0.0002, "elapsed_sec": 6153.133905887604, "step_time_sec": 8.228637806983897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 746, "loss": 6.120555877685547, "lr": 0.0002, "elapsed_sec": 6161.361617803574, "step_time_sec": 8.227479522000067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 747, "loss": 6.170558452606201, "lr": 0.0002, "elapsed_sec": 6169.5927946567535, "step_time_sec": 8.231105199025478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 748, "loss": 6.1477131843566895, "lr": 0.0002, "elapsed_sec": 6177.821706771851, "step_time_sec": 8.228722255997127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 749, "loss": 6.15254020690918, "lr": 0.0002, "elapsed_sec": 6186.0516448020935, "step_time_sec": 8.22975708998274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 750, "loss": 6.062813758850098, "lr": 0.0002, "elapsed_sec": 6194.281987428665, "step_time_sec": 8.23017510099453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 751, "loss": 6.063478469848633, "lr": 0.0002, "elapsed_sec": 6202.5121467113495, "step_time_sec": 8.230033883999567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 752, "loss": 6.1396002769470215, "lr": 0.0002, "elapsed_sec": 6210.742185592651, "step_time_sec": 8.229857970000012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 753, "loss": 6.089064598083496, "lr": 0.0002, "elapsed_sec": 6218.971966743469, "step_time_sec": 8.229624412022531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 754, "loss": 6.064237594604492, "lr": 0.0002, "elapsed_sec": 6227.202111005783, "step_time_sec": 8.22999775299104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 755, "loss": 6.148715972900391, "lr": 0.0002, "elapsed_sec": 6235.431606769562, "step_time_sec": 8.229341690021101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 756, "loss": 6.219992160797119, "lr": 0.0002, "elapsed_sec": 6243.660724878311, "step_time_sec": 8.22895012499066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 757, "loss": 6.062867641448975, "lr": 0.0002, "elapsed_sec": 6251.890699625015, "step_time_sec": 8.22981648601126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 758, "loss": 6.158252716064453, "lr": 0.0002, "elapsed_sec": 6260.121078014374, "step_time_sec": 8.230239895987324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 759, "loss": 6.046947479248047, "lr": 0.0002, "elapsed_sec": 6268.350981235504, "step_time_sec": 8.229773225000827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 760, "loss": 6.123488903045654, "lr": 0.0002, "elapsed_sec": 6276.581159114838, "step_time_sec": 8.229948315012734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 761, "loss": 6.11777925491333, "lr": 0.0002, "elapsed_sec": 6284.810391902924, "step_time_sec": 8.229079932003515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 762, "loss": 6.019218444824219, "lr": 0.0002, "elapsed_sec": 6293.0406177043915, "step_time_sec": 8.230141254985938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 763, "loss": 6.109716892242432, "lr": 0.0002, "elapsed_sec": 6301.269071102142, "step_time_sec": 8.228240511991316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 764, "loss": 6.115269660949707, "lr": 0.0002, "elapsed_sec": 6309.497372150421, "step_time_sec": 8.228153812990058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 765, "loss": 5.947271823883057, "lr": 0.0002, "elapsed_sec": 6317.726912975311, "step_time_sec": 8.229360972007271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 766, "loss": 6.118536949157715, "lr": 0.0002, "elapsed_sec": 6325.956646680832, "step_time_sec": 8.229598464007722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 767, "loss": 6.086315631866455, "lr": 0.0002, "elapsed_sec": 6334.18622136116, "step_time_sec": 8.229403521982022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 768, "loss": 6.002045154571533, "lr": 0.0002, "elapsed_sec": 6342.415272474289, "step_time_sec": 8.228888675017515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 769, "loss": 6.038228988647461, "lr": 0.0002, "elapsed_sec": 6350.645541191101, "step_time_sec": 8.23010168998735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 770, "loss": 5.918094158172607, "lr": 0.0002, "elapsed_sec": 6358.874642133713, "step_time_sec": 8.228948564006714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 771, "loss": 6.195390224456787, "lr": 0.0002, "elapsed_sec": 6367.102824687958, "step_time_sec": 8.22803903798922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 772, "loss": 6.126861095428467, "lr": 0.0002, "elapsed_sec": 6375.332010746002, "step_time_sec": 8.229093169997213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 773, "loss": 6.049898147583008, "lr": 0.0002, "elapsed_sec": 6383.5592658519745, "step_time_sec": 8.227029998000944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 774, "loss": 6.053187847137451, "lr": 0.0002, "elapsed_sec": 6391.788898468018, "step_time_sec": 8.229474877996836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 775, "loss": 6.141943454742432, "lr": 0.0002, "elapsed_sec": 6400.018761873245, "step_time_sec": 8.229757702007191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 776, "loss": 5.922675132751465, "lr": 0.0002, "elapsed_sec": 6408.248254060745, "step_time_sec": 8.229367601015838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 777, "loss": 6.055163860321045, "lr": 0.0002, "elapsed_sec": 6416.478088378906, "step_time_sec": 8.229590669012396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 778, "loss": 6.1147613525390625, "lr": 0.0002, "elapsed_sec": 6424.705068588257, "step_time_sec": 8.226865835982608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 779, "loss": 6.127535820007324, "lr": 0.0002, "elapsed_sec": 6432.9328417778015, "step_time_sec": 8.227599006000673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 780, "loss": 5.9755353927612305, "lr": 0.0002, "elapsed_sec": 6441.163021564484, "step_time_sec": 8.230017881985987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 781, "loss": 6.16023063659668, "lr": 0.0002, "elapsed_sec": 6449.392754793167, "step_time_sec": 8.22958756799926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 782, "loss": 6.268181324005127, "lr": 0.0002, "elapsed_sec": 6457.623212337494, "step_time_sec": 8.23029050801415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 783, "loss": 6.055598258972168, "lr": 0.0002, "elapsed_sec": 6465.850870370865, "step_time_sec": 8.227526702015894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 784, "loss": 6.184757232666016, "lr": 0.0002, "elapsed_sec": 6474.0795974731445, "step_time_sec": 8.228535274975002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 785, "loss": 6.174934387207031, "lr": 0.0002, "elapsed_sec": 6482.307100534439, "step_time_sec": 8.227342700993177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 786, "loss": 6.081526279449463, "lr": 0.0002, "elapsed_sec": 6490.535474538803, "step_time_sec": 8.22825825199834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 787, "loss": 6.059023380279541, "lr": 0.0002, "elapsed_sec": 6498.763379573822, "step_time_sec": 8.227671277010813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 788, "loss": 6.07538366317749, "lr": 0.0002, "elapsed_sec": 6506.991659641266, "step_time_sec": 8.228112506010802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 789, "loss": 6.061354637145996, "lr": 0.0002, "elapsed_sec": 6515.220409631729, "step_time_sec": 8.228597298002569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 790, "loss": 6.011831283569336, "lr": 0.0002, "elapsed_sec": 6523.449462413788, "step_time_sec": 8.228983108012471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 791, "loss": 5.987299919128418, "lr": 0.0002, "elapsed_sec": 6531.67826461792, "step_time_sec": 8.228584508004133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 792, "loss": 6.0089569091796875, "lr": 0.0002, "elapsed_sec": 6539.905841827393, "step_time_sec": 8.227423343982082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 793, "loss": 6.010016441345215, "lr": 0.0002, "elapsed_sec": 6548.134752035141, "step_time_sec": 8.22876540699508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 794, "loss": 6.067809104919434, "lr": 0.0002, "elapsed_sec": 6556.363008975983, "step_time_sec": 8.228094032005174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 795, "loss": 5.886960506439209, "lr": 0.0002, "elapsed_sec": 6564.590330123901, "step_time_sec": 8.227151580998907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 796, "loss": 6.024059772491455, "lr": 0.0002, "elapsed_sec": 6572.819610357285, "step_time_sec": 8.229106352984672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 797, "loss": 6.0934529304504395, "lr": 0.0002, "elapsed_sec": 6581.049922466278, "step_time_sec": 8.230188218993135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 798, "loss": 5.962614059448242, "lr": 0.0002, "elapsed_sec": 6589.278096914291, "step_time_sec": 8.228050463018008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 799, "loss": 6.123087406158447, "lr": 0.0002, "elapsed_sec": 6597.506222486496, "step_time_sec": 8.227946753992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 800, "loss": 6.131389617919922, "lr": 0.0002, "elapsed_sec": 6605.7335460186005, "step_time_sec": 8.227210184995783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 801, "loss": 6.021420955657959, "lr": 0.0002, "elapsed_sec": 6613.960973501205, "step_time_sec": 8.227214373997413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 802, "loss": 5.954859256744385, "lr": 0.0002, "elapsed_sec": 6622.190683841705, "step_time_sec": 8.229537959006848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 803, "loss": 5.946731090545654, "lr": 0.0002, "elapsed_sec": 6630.419895887375, "step_time_sec": 8.229072605987312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 804, "loss": 5.8672308921813965, "lr": 0.0002, "elapsed_sec": 6638.648703575134, "step_time_sec": 8.228685292997397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 805, "loss": 5.982762813568115, "lr": 0.0002, "elapsed_sec": 6646.877566337585, "step_time_sec": 8.228659135987982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 806, "loss": 5.896338939666748, "lr": 0.0002, "elapsed_sec": 6655.1064648628235, "step_time_sec": 8.228712481999537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 807, "loss": 6.234790802001953, "lr": 0.0002, "elapsed_sec": 6663.336193323135, "step_time_sec": 8.229617392004002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 808, "loss": 5.947345733642578, "lr": 0.0002, "elapsed_sec": 6671.566642999649, "step_time_sec": 8.23024542600615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 809, "loss": 6.079411029815674, "lr": 0.0002, "elapsed_sec": 6679.796672582626, "step_time_sec": 8.229872497991892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 810, "loss": 5.953326225280762, "lr": 0.0002, "elapsed_sec": 6688.025723218918, "step_time_sec": 8.228851633990416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 811, "loss": 5.906106948852539, "lr": 0.0002, "elapsed_sec": 6696.254667043686, "step_time_sec": 8.228857806010637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 812, "loss": 6.036899089813232, "lr": 0.0002, "elapsed_sec": 6704.484380722046, "step_time_sec": 8.229535508027766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 813, "loss": 6.007087230682373, "lr": 0.0002, "elapsed_sec": 6713.764996051788, "step_time_sec": 9.280448671022896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 814, "loss": 6.05131196975708, "lr": 0.0002, "elapsed_sec": 6721.994819879532, "step_time_sec": 8.229589256981853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 815, "loss": 5.913909912109375, "lr": 0.0002, "elapsed_sec": 6730.223656177521, "step_time_sec": 8.228702205000445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 816, "loss": 6.012244701385498, "lr": 0.0002, "elapsed_sec": 6738.45378279686, "step_time_sec": 8.229986960999668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 817, "loss": 5.935700416564941, "lr": 0.0002, "elapsed_sec": 6746.683435916901, "step_time_sec": 8.229447798017645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 818, "loss": 5.956183433532715, "lr": 0.0002, "elapsed_sec": 6754.9128658771515, "step_time_sec": 8.229277745995205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 819, "loss": 5.964108467102051, "lr": 0.0002, "elapsed_sec": 6763.143188714981, "step_time_sec": 8.23015905500506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 820, "loss": 5.984810829162598, "lr": 0.0002, "elapsed_sec": 6771.3715925216675, "step_time_sec": 8.228269338986138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 821, "loss": 5.880571365356445, "lr": 0.0002, "elapsed_sec": 6779.600259065628, "step_time_sec": 8.228531486995053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 822, "loss": 5.887503623962402, "lr": 0.0002, "elapsed_sec": 6787.829982757568, "step_time_sec": 8.22953530299128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 823, "loss": 5.9919047355651855, "lr": 0.0002, "elapsed_sec": 6796.057973623276, "step_time_sec": 8.227827964990865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 824, "loss": 6.0330071449279785, "lr": 0.0002, "elapsed_sec": 6804.286880016327, "step_time_sec": 8.22874270498869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 825, "loss": 6.003767967224121, "lr": 0.0002, "elapsed_sec": 6812.516555786133, "step_time_sec": 8.229528897994896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 826, "loss": 6.11436128616333, "lr": 0.0002, "elapsed_sec": 6820.746666431427, "step_time_sec": 8.22993011699873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 827, "loss": 5.948924541473389, "lr": 0.0002, "elapsed_sec": 6828.977450609207, "step_time_sec": 8.230680931999814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 828, "loss": 5.937865734100342, "lr": 0.0002, "elapsed_sec": 6837.2071788311005, "step_time_sec": 8.229513614001917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 829, "loss": 5.945302963256836, "lr": 0.0002, "elapsed_sec": 6845.4367961883545, "step_time_sec": 8.229463317984482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 830, "loss": 5.86661434173584, "lr": 0.0002, "elapsed_sec": 6853.664515733719, "step_time_sec": 8.227633676986443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 831, "loss": 6.029479503631592, "lr": 0.0002, "elapsed_sec": 6861.89400434494, "step_time_sec": 8.22925355299958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 832, "loss": 6.005712032318115, "lr": 0.0002, "elapsed_sec": 6870.124233007431, "step_time_sec": 8.230063593015075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 833, "loss": 5.917523384094238, "lr": 0.0002, "elapsed_sec": 6878.352600812912, "step_time_sec": 8.228228847001446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 834, "loss": 5.875757217407227, "lr": 0.0002, "elapsed_sec": 6886.579526901245, "step_time_sec": 8.226788256986765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 835, "loss": 5.961983680725098, "lr": 0.0002, "elapsed_sec": 6894.807817220688, "step_time_sec": 8.228110827010823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 836, "loss": 5.972943305969238, "lr": 0.0002, "elapsed_sec": 6903.036787748337, "step_time_sec": 8.228806213010103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 837, "loss": 5.9331955909729, "lr": 0.0002, "elapsed_sec": 6911.265148639679, "step_time_sec": 8.228291749983327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 838, "loss": 5.931972980499268, "lr": 0.0002, "elapsed_sec": 6919.49493432045, "step_time_sec": 8.229583562992048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 839, "loss": 5.966734409332275, "lr": 0.0002, "elapsed_sec": 6927.72473859787, "step_time_sec": 8.22962884500157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 840, "loss": 5.98411750793457, "lr": 0.0002, "elapsed_sec": 6935.9522495269775, "step_time_sec": 8.22740774898557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 841, "loss": 5.901174068450928, "lr": 0.0002, "elapsed_sec": 6944.181028604507, "step_time_sec": 8.228622030990664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 842, "loss": 5.906156063079834, "lr": 0.0002, "elapsed_sec": 6952.410337686539, "step_time_sec": 8.229106421989854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 843, "loss": 5.930456161499023, "lr": 0.0002, "elapsed_sec": 6960.637886762619, "step_time_sec": 8.227377667004475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 844, "loss": 5.777782440185547, "lr": 0.0002, "elapsed_sec": 6968.865829706192, "step_time_sec": 8.227848737005843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 845, "loss": 5.959997653961182, "lr": 0.0002, "elapsed_sec": 6977.094321966171, "step_time_sec": 8.228288334008539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 846, "loss": 5.844476222991943, "lr": 0.0002, "elapsed_sec": 6985.321474790573, "step_time_sec": 8.226981350017013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 847, "loss": 5.939876079559326, "lr": 0.0002, "elapsed_sec": 6993.549593925476, "step_time_sec": 8.228003408003133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 848, "loss": 5.99676513671875, "lr": 0.0002, "elapsed_sec": 7001.779434204102, "step_time_sec": 8.229682395001873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 849, "loss": 5.984302043914795, "lr": 0.0002, "elapsed_sec": 7010.009442329407, "step_time_sec": 8.229833356977906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 850, "loss": 5.946237564086914, "lr": 0.0002, "elapsed_sec": 7018.239692687988, "step_time_sec": 8.23015055700671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 851, "loss": 6.042696475982666, "lr": 0.0002, "elapsed_sec": 7026.4696452617645, "step_time_sec": 8.229722763004247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 852, "loss": 5.979262351989746, "lr": 0.0002, "elapsed_sec": 7034.699452638626, "step_time_sec": 8.229644148988882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 853, "loss": 5.899518966674805, "lr": 0.0002, "elapsed_sec": 7042.926905870438, "step_time_sec": 8.227291628019884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 854, "loss": 5.843690395355225, "lr": 0.0002, "elapsed_sec": 7051.156703233719, "step_time_sec": 8.229657129006227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 855, "loss": 5.965251922607422, "lr": 0.0002, "elapsed_sec": 7059.386758089066, "step_time_sec": 8.229933027003426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 856, "loss": 5.9690704345703125, "lr": 0.0002, "elapsed_sec": 7067.617007732391, "step_time_sec": 8.230039875983493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 857, "loss": 5.95482063293457, "lr": 0.0002, "elapsed_sec": 7075.8436896800995, "step_time_sec": 8.226527748978697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 858, "loss": 5.934957981109619, "lr": 0.0002, "elapsed_sec": 7084.072115659714, "step_time_sec": 8.228330155019648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 859, "loss": 5.975223064422607, "lr": 0.0002, "elapsed_sec": 7092.302126169205, "step_time_sec": 8.229838436003774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 860, "loss": 5.8735504150390625, "lr": 0.0002, "elapsed_sec": 7100.532046556473, "step_time_sec": 8.229734822001774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 861, "loss": 5.946963310241699, "lr": 0.0002, "elapsed_sec": 7108.761649608612, "step_time_sec": 8.229460075002862, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 862, "loss": 5.838242053985596, "lr": 0.0002, "elapsed_sec": 7116.991860866547, "step_time_sec": 8.230103707988746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 863, "loss": 5.929205894470215, "lr": 0.0002, "elapsed_sec": 7125.221928119659, "step_time_sec": 8.229891514987685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 864, "loss": 5.845162391662598, "lr": 0.0002, "elapsed_sec": 7133.452989578247, "step_time_sec": 8.230825610022293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 865, "loss": 5.8833417892456055, "lr": 0.0002, "elapsed_sec": 7141.682769775391, "step_time_sec": 8.229582478001248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 866, "loss": 5.98375129699707, "lr": 0.0002, "elapsed_sec": 7149.9131295681, "step_time_sec": 8.230242091987748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 867, "loss": 5.8956685066223145, "lr": 0.0002, "elapsed_sec": 7158.140032529831, "step_time_sec": 8.226722105988301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 868, "loss": 5.964231967926025, "lr": 0.0002, "elapsed_sec": 7166.368173837662, "step_time_sec": 8.228005915996619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 869, "loss": 5.9325761795043945, "lr": 0.0002, "elapsed_sec": 7174.597724676132, "step_time_sec": 8.229414470988559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 870, "loss": 5.929478645324707, "lr": 0.0002, "elapsed_sec": 7182.8278901577, "step_time_sec": 8.229974784015212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 871, "loss": 5.830590724945068, "lr": 0.0002, "elapsed_sec": 7191.055186986923, "step_time_sec": 8.22715848099324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 872, "loss": 5.944486141204834, "lr": 0.0002, "elapsed_sec": 7199.2839114665985, "step_time_sec": 8.228622768016066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 873, "loss": 6.07701301574707, "lr": 0.0002, "elapsed_sec": 7207.5137186050415, "step_time_sec": 8.229653083020821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 874, "loss": 5.908764362335205, "lr": 0.0002, "elapsed_sec": 7215.743518590927, "step_time_sec": 8.229564228997333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 875, "loss": 5.950359344482422, "lr": 0.0002, "elapsed_sec": 7223.973187446594, "step_time_sec": 8.229552139993757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 876, "loss": 5.949813365936279, "lr": 0.0002, "elapsed_sec": 7232.200448036194, "step_time_sec": 8.22710569499759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 877, "loss": 5.890500545501709, "lr": 0.0002, "elapsed_sec": 7240.430480003357, "step_time_sec": 8.229820829001255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 878, "loss": 5.98137903213501, "lr": 0.0002, "elapsed_sec": 7248.660771846771, "step_time_sec": 8.230137128004571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 879, "loss": 6.001167297363281, "lr": 0.0002, "elapsed_sec": 7256.890987634659, "step_time_sec": 8.230041433009319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 880, "loss": 5.919180870056152, "lr": 0.0002, "elapsed_sec": 7265.12091088295, "step_time_sec": 8.229780067980755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 881, "loss": 5.901373863220215, "lr": 0.0002, "elapsed_sec": 7273.350634813309, "step_time_sec": 8.229588228976354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 882, "loss": 5.923462390899658, "lr": 0.0002, "elapsed_sec": 7281.578367233276, "step_time_sec": 8.227573740005028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 883, "loss": 5.845873832702637, "lr": 0.0002, "elapsed_sec": 7289.8077919483185, "step_time_sec": 8.229289604001679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 884, "loss": 5.866557598114014, "lr": 0.0002, "elapsed_sec": 7298.0374751091, "step_time_sec": 8.229504118004115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 885, "loss": 5.8223652839660645, "lr": 0.0002, "elapsed_sec": 7306.266932010651, "step_time_sec": 8.229313480027486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 886, "loss": 5.986145496368408, "lr": 0.0002, "elapsed_sec": 7314.493410825729, "step_time_sec": 8.226352684985613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 887, "loss": 5.94229793548584, "lr": 0.0002, "elapsed_sec": 7322.722924947739, "step_time_sec": 8.229336020012852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 888, "loss": 5.848353862762451, "lr": 0.0002, "elapsed_sec": 7330.951638460159, "step_time_sec": 8.228591084014624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 889, "loss": 5.7395195960998535, "lr": 0.0002, "elapsed_sec": 7339.182034730911, "step_time_sec": 8.23020260501653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 890, "loss": 5.823785781860352, "lr": 0.0002, "elapsed_sec": 7347.412127017975, "step_time_sec": 8.229874227981782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 891, "loss": 5.902729511260986, "lr": 0.0002, "elapsed_sec": 7355.640756607056, "step_time_sec": 8.228426681976998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 892, "loss": 5.951573371887207, "lr": 0.0002, "elapsed_sec": 7363.869438648224, "step_time_sec": 8.228574353997828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 893, "loss": 5.853367805480957, "lr": 0.0002, "elapsed_sec": 7372.098928689957, "step_time_sec": 8.229278315004194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 894, "loss": 5.973324775695801, "lr": 0.0002, "elapsed_sec": 7380.32813835144, "step_time_sec": 8.229056651995052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 895, "loss": 5.902538299560547, "lr": 0.0002, "elapsed_sec": 7388.557774066925, "step_time_sec": 8.229498839995358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 896, "loss": 5.909477233886719, "lr": 0.0002, "elapsed_sec": 7396.78679394722, "step_time_sec": 8.228855059016496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 897, "loss": 5.762050628662109, "lr": 0.0002, "elapsed_sec": 7405.016357898712, "step_time_sec": 8.229365070990752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 898, "loss": 5.895092964172363, "lr": 0.0002, "elapsed_sec": 7413.246649026871, "step_time_sec": 8.230126168986317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 899, "loss": 5.887320518493652, "lr": 0.0002, "elapsed_sec": 7421.476347208023, "step_time_sec": 8.229562178021297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 900, "loss": 5.9464850425720215, "lr": 0.0002, "elapsed_sec": 7429.70796918869, "step_time_sec": 8.231536716979463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 901, "loss": 5.964878559112549, "lr": 0.0002, "elapsed_sec": 7437.9378135204315, "step_time_sec": 8.22964305797359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 902, "loss": 5.888519763946533, "lr": 0.0002, "elapsed_sec": 7446.167776584625, "step_time_sec": 8.229850947012892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 903, "loss": 5.845760345458984, "lr": 0.0002, "elapsed_sec": 7454.397767543793, "step_time_sec": 8.229758289991878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 904, "loss": 5.8266143798828125, "lr": 0.0002, "elapsed_sec": 7462.627630472183, "step_time_sec": 8.229720067989547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 905, "loss": 5.9616241455078125, "lr": 0.0002, "elapsed_sec": 7470.85728931427, "step_time_sec": 8.229533383011585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 906, "loss": 5.844291687011719, "lr": 0.0002, "elapsed_sec": 7479.085832834244, "step_time_sec": 8.2283931330021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 907, "loss": 5.8822197914123535, "lr": 0.0002, "elapsed_sec": 7487.316107749939, "step_time_sec": 8.230085507995682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 908, "loss": 5.851210117340088, "lr": 0.0002, "elapsed_sec": 7495.545765161514, "step_time_sec": 8.229515290993731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 909, "loss": 5.8815178871154785, "lr": 0.0002, "elapsed_sec": 7503.775844812393, "step_time_sec": 8.229925904015545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 910, "loss": 5.86076545715332, "lr": 0.0002, "elapsed_sec": 7512.004727840424, "step_time_sec": 8.228710423019947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 911, "loss": 5.970577716827393, "lr": 0.0002, "elapsed_sec": 7520.231337785721, "step_time_sec": 8.226463748986134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 912, "loss": 5.790316581726074, "lr": 0.0002, "elapsed_sec": 7528.461320400238, "step_time_sec": 8.229873061005492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 913, "loss": 5.837367057800293, "lr": 0.0002, "elapsed_sec": 7536.692483186722, "step_time_sec": 8.230974181002239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 914, "loss": 6.123839855194092, "lr": 0.0002, "elapsed_sec": 7544.919588088989, "step_time_sec": 8.227007297013188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 915, "loss": 5.73406982421875, "lr": 0.0002, "elapsed_sec": 7553.1484224796295, "step_time_sec": 8.228655659011565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 916, "loss": 5.89591646194458, "lr": 0.0002, "elapsed_sec": 7561.375779390335, "step_time_sec": 8.227158952999162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 917, "loss": 5.75557804107666, "lr": 0.0002, "elapsed_sec": 7569.603959083557, "step_time_sec": 8.228085728012957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 918, "loss": 5.857518196105957, "lr": 0.0002, "elapsed_sec": 7577.833589792252, "step_time_sec": 8.22942960599903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 919, "loss": 5.745097637176514, "lr": 0.0002, "elapsed_sec": 7586.063064575195, "step_time_sec": 8.229295172001002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 920, "loss": 5.944908618927002, "lr": 0.0002, "elapsed_sec": 7594.293275117874, "step_time_sec": 8.23008271100116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 921, "loss": 5.77063512802124, "lr": 0.0002, "elapsed_sec": 7602.522463798523, "step_time_sec": 8.228999093989842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 922, "loss": 5.848954200744629, "lr": 0.0002, "elapsed_sec": 7610.750225305557, "step_time_sec": 8.227621825993992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 923, "loss": 5.89720344543457, "lr": 0.0002, "elapsed_sec": 7618.97989988327, "step_time_sec": 8.22951443499187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 924, "loss": 5.790800094604492, "lr": 0.0002, "elapsed_sec": 7627.210110425949, "step_time_sec": 8.230041199014522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 925, "loss": 5.9093337059021, "lr": 0.0002, "elapsed_sec": 7635.440672159195, "step_time_sec": 8.230474350013537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 926, "loss": 5.793030738830566, "lr": 0.0002, "elapsed_sec": 7643.670962572098, "step_time_sec": 8.230073368002195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 927, "loss": 5.8936591148376465, "lr": 0.0002, "elapsed_sec": 7651.899765491486, "step_time_sec": 8.2286548029806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 928, "loss": 5.759986400604248, "lr": 0.0002, "elapsed_sec": 7660.128347396851, "step_time_sec": 8.228426591987954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 929, "loss": 5.812958717346191, "lr": 0.0002, "elapsed_sec": 7668.3585729599, "step_time_sec": 8.230111223005224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 930, "loss": 5.864936828613281, "lr": 0.0002, "elapsed_sec": 7676.588940143585, "step_time_sec": 8.230158507998567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 931, "loss": 5.862442493438721, "lr": 0.0002, "elapsed_sec": 7684.818444490433, "step_time_sec": 8.229384745005518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 932, "loss": 5.860655784606934, "lr": 0.0002, "elapsed_sec": 7693.046808719635, "step_time_sec": 8.228203020000365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 933, "loss": 5.7707600593566895, "lr": 0.0002, "elapsed_sec": 7701.276414394379, "step_time_sec": 8.22943932499038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 934, "loss": 5.867938995361328, "lr": 0.0002, "elapsed_sec": 7709.505870819092, "step_time_sec": 8.229312796989689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 935, "loss": 5.916243553161621, "lr": 0.0002, "elapsed_sec": 7717.736322641373, "step_time_sec": 8.230293731990969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 936, "loss": 5.834097385406494, "lr": 0.0002, "elapsed_sec": 7725.966119527817, "step_time_sec": 8.229707585996948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 937, "loss": 5.877383708953857, "lr": 0.0002, "elapsed_sec": 7734.194392681122, "step_time_sec": 8.228090584016172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 938, "loss": 5.83904504776001, "lr": 0.0002, "elapsed_sec": 7742.424199819565, "step_time_sec": 8.229631439986406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 939, "loss": 5.841861724853516, "lr": 0.0002, "elapsed_sec": 7750.651067495346, "step_time_sec": 8.22668923099991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 940, "loss": 5.886272430419922, "lr": 0.0002, "elapsed_sec": 7758.879292964935, "step_time_sec": 8.228078572021332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 941, "loss": 5.808419704437256, "lr": 0.0002, "elapsed_sec": 7767.109179019928, "step_time_sec": 8.229755367996404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 942, "loss": 5.872170448303223, "lr": 0.0002, "elapsed_sec": 7775.3380427360535, "step_time_sec": 8.228678432002198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 943, "loss": 5.814393520355225, "lr": 0.0002, "elapsed_sec": 7783.566117525101, "step_time_sec": 8.227979249000782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 944, "loss": 5.852724552154541, "lr": 0.0002, "elapsed_sec": 7791.7974643707275, "step_time_sec": 8.231132397981128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 945, "loss": 5.912739276885986, "lr": 0.0002, "elapsed_sec": 7800.027443647385, "step_time_sec": 8.229875884979265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 946, "loss": 5.813299655914307, "lr": 0.0002, "elapsed_sec": 7808.256906509399, "step_time_sec": 8.229261866013985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 947, "loss": 5.812803268432617, "lr": 0.0002, "elapsed_sec": 7816.48397564888, "step_time_sec": 8.226909584976966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 948, "loss": 5.877680778503418, "lr": 0.0002, "elapsed_sec": 7824.713691711426, "step_time_sec": 8.22956270398572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 949, "loss": 5.774181842803955, "lr": 0.0002, "elapsed_sec": 7832.943426847458, "step_time_sec": 8.229581253021024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 950, "loss": 5.751199245452881, "lr": 0.0002, "elapsed_sec": 7841.173317909241, "step_time_sec": 8.22975087800296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 951, "loss": 5.750495433807373, "lr": 0.0002, "elapsed_sec": 7849.401550531387, "step_time_sec": 8.228104687004816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 952, "loss": 5.805752277374268, "lr": 0.0002, "elapsed_sec": 7857.631793498993, "step_time_sec": 8.23003109099227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 953, "loss": 5.79350471496582, "lr": 0.0002, "elapsed_sec": 7865.861702919006, "step_time_sec": 8.229773347993614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 954, "loss": 5.758545398712158, "lr": 0.0002, "elapsed_sec": 7874.091357469559, "step_time_sec": 8.229507221985841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 955, "loss": 5.758182525634766, "lr": 0.0002, "elapsed_sec": 7882.3211143016815, "step_time_sec": 8.229639147990383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 956, "loss": 5.687973976135254, "lr": 0.0002, "elapsed_sec": 7890.547201156616, "step_time_sec": 8.22589049700764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 957, "loss": 5.905525207519531, "lr": 0.0002, "elapsed_sec": 7898.775880098343, "step_time_sec": 8.228598838002654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 958, "loss": 5.7583136558532715, "lr": 0.0002, "elapsed_sec": 7907.003945112228, "step_time_sec": 8.227851975010708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 959, "loss": 5.737029075622559, "lr": 0.0002, "elapsed_sec": 7915.233018159866, "step_time_sec": 8.228955694008619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 960, "loss": 5.894069194793701, "lr": 0.0002, "elapsed_sec": 7923.460666179657, "step_time_sec": 8.22747488701134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 961, "loss": 5.765750408172607, "lr": 0.0002, "elapsed_sec": 7931.690407276154, "step_time_sec": 8.22953635599697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 962, "loss": 5.720954895019531, "lr": 0.0002, "elapsed_sec": 7939.920326948166, "step_time_sec": 8.229771756014088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 963, "loss": 5.716599464416504, "lr": 0.0002, "elapsed_sec": 7948.149580955505, "step_time_sec": 8.229085107013816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 964, "loss": 5.921523094177246, "lr": 0.0002, "elapsed_sec": 7956.37896323204, "step_time_sec": 8.229320707003353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 965, "loss": 5.942735195159912, "lr": 0.0002, "elapsed_sec": 7964.6091067790985, "step_time_sec": 8.229923792008776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 966, "loss": 5.774704456329346, "lr": 0.0002, "elapsed_sec": 7972.839622497559, "step_time_sec": 8.230307607998839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 967, "loss": 5.743651390075684, "lr": 0.0002, "elapsed_sec": 7981.069303750992, "step_time_sec": 8.22961008900893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 968, "loss": 5.823121547698975, "lr": 0.0002, "elapsed_sec": 7989.298966407776, "step_time_sec": 8.229485185001977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 969, "loss": 5.844394207000732, "lr": 0.0002, "elapsed_sec": 7997.526194810867, "step_time_sec": 8.226995569013525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 970, "loss": 5.813067436218262, "lr": 0.0002, "elapsed_sec": 8005.755278348923, "step_time_sec": 8.22893836000003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 971, "loss": 5.708871364593506, "lr": 0.0002, "elapsed_sec": 8013.98201584816, "step_time_sec": 8.226612995000323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 972, "loss": 5.827378749847412, "lr": 0.0002, "elapsed_sec": 8022.211101293564, "step_time_sec": 8.228856862988323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 973, "loss": 5.8993964195251465, "lr": 0.0002, "elapsed_sec": 8030.438424110413, "step_time_sec": 8.227179570996668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 974, "loss": 6.03269624710083, "lr": 0.0002, "elapsed_sec": 8038.6666395664215, "step_time_sec": 8.22801384201739, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 975, "loss": 5.7810258865356445, "lr": 0.0002, "elapsed_sec": 8046.895040273666, "step_time_sec": 8.22824786300771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 976, "loss": 5.790118217468262, "lr": 0.0002, "elapsed_sec": 8055.12482714653, "step_time_sec": 8.229703249002341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 977, "loss": 5.830965518951416, "lr": 0.0002, "elapsed_sec": 8063.354292631149, "step_time_sec": 8.229242877976503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 978, "loss": 5.887166500091553, "lr": 0.0002, "elapsed_sec": 8071.58216381073, "step_time_sec": 8.227714215987362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 979, "loss": 5.831849575042725, "lr": 0.0002, "elapsed_sec": 8079.811609983444, "step_time_sec": 8.2292858120054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 980, "loss": 5.8021135330200195, "lr": 0.0002, "elapsed_sec": 8088.041479110718, "step_time_sec": 8.229696602997137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 981, "loss": 5.867305278778076, "lr": 0.0002, "elapsed_sec": 8096.269765138626, "step_time_sec": 8.228141586005222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 982, "loss": 5.681748867034912, "lr": 0.0002, "elapsed_sec": 8104.498478651047, "step_time_sec": 8.228594337997492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 983, "loss": 5.756385803222656, "lr": 0.0002, "elapsed_sec": 8112.7258887290955, "step_time_sec": 8.227176046988461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 984, "loss": 5.82811164855957, "lr": 0.0002, "elapsed_sec": 8120.9554789066315, "step_time_sec": 8.22945936198812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 985, "loss": 5.745394706726074, "lr": 0.0002, "elapsed_sec": 8129.185195684433, "step_time_sec": 8.229595673008589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 986, "loss": 5.692750930786133, "lr": 0.0002, "elapsed_sec": 8137.414803981781, "step_time_sec": 8.229429724015063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 987, "loss": 5.841291904449463, "lr": 0.0002, "elapsed_sec": 8145.644120931625, "step_time_sec": 8.229203127004439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 988, "loss": 5.719700336456299, "lr": 0.0002, "elapsed_sec": 8153.87565946579, "step_time_sec": 8.231328453985043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 989, "loss": 5.717268466949463, "lr": 0.0002, "elapsed_sec": 8162.104292869568, "step_time_sec": 8.228499707998708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 990, "loss": 5.754941940307617, "lr": 0.0002, "elapsed_sec": 8170.332307100296, "step_time_sec": 8.227892630995484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 991, "loss": 5.697622776031494, "lr": 0.0002, "elapsed_sec": 8178.562677383423, "step_time_sec": 8.230165613000281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 992, "loss": 5.732921123504639, "lr": 0.0002, "elapsed_sec": 8186.793066978455, "step_time_sec": 8.230206701991847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 993, "loss": 5.795831203460693, "lr": 0.0002, "elapsed_sec": 8195.022792816162, "step_time_sec": 8.229572087002452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 994, "loss": 5.7555317878723145, "lr": 0.0002, "elapsed_sec": 8203.251704216003, "step_time_sec": 8.228761856997153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 995, "loss": 5.719804763793945, "lr": 0.0002, "elapsed_sec": 8211.480014801025, "step_time_sec": 8.228185418003704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 996, "loss": 5.773508548736572, "lr": 0.0002, "elapsed_sec": 8219.709623098373, "step_time_sec": 8.229455674008932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 997, "loss": 5.75764799118042, "lr": 0.0002, "elapsed_sec": 8227.939721345901, "step_time_sec": 8.229882318002637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 998, "loss": 5.832427978515625, "lr": 0.0002, "elapsed_sec": 8236.16949725151, "step_time_sec": 8.22962205298245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 999, "loss": 5.691696643829346, "lr": 0.0002, "elapsed_sec": 8244.398778676987, "step_time_sec": 8.229129193001427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1000, "loss": 5.771336555480957, "lr": 0.0002, "elapsed_sec": 8252.628346920013, "step_time_sec": 51.41397342397249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 1.0455801489879377, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1001, "loss": 5.771897315979004, "lr": 0.0002, "elapsed_sec": 8304.044908046722, "step_time_sec": 8.23189163801726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1002, "loss": 5.702426433563232, "lr": 0.0002, "elapsed_sec": 8312.274377346039, "step_time_sec": 8.229382004996296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1003, "loss": 5.861361026763916, "lr": 0.0002, "elapsed_sec": 8320.504188299179, "step_time_sec": 8.229627862019697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1004, "loss": 5.761072635650635, "lr": 0.0002, "elapsed_sec": 8328.734146356583, "step_time_sec": 8.229831617005402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1005, "loss": 5.772669792175293, "lr": 0.0002, "elapsed_sec": 8336.964079618454, "step_time_sec": 8.229688239982352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1006, "loss": 5.789926528930664, "lr": 0.0002, "elapsed_sec": 8345.19351053238, "step_time_sec": 8.229318780999165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1007, "loss": 5.8855061531066895, "lr": 0.0002, "elapsed_sec": 8353.4227912426, "step_time_sec": 8.229106512008002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1008, "loss": 5.713780403137207, "lr": 0.0002, "elapsed_sec": 8361.652240276337, "step_time_sec": 8.229298266989645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1009, "loss": 5.733396530151367, "lr": 0.0002, "elapsed_sec": 8369.882134199142, "step_time_sec": 8.229777227999875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1010, "loss": 5.696365833282471, "lr": 0.0002, "elapsed_sec": 8378.112021923065, "step_time_sec": 8.229686411999865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1011, "loss": 5.757964611053467, "lr": 0.0002, "elapsed_sec": 8386.34262752533, "step_time_sec": 8.230431261996273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1012, "loss": 5.636813163757324, "lr": 0.0002, "elapsed_sec": 8394.573283433914, "step_time_sec": 8.230565392004792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1013, "loss": 5.972193241119385, "lr": 0.0002, "elapsed_sec": 8402.803879976273, "step_time_sec": 8.230386470007943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1014, "loss": 5.7719292640686035, "lr": 0.0002, "elapsed_sec": 8411.030621051788, "step_time_sec": 8.226579327019863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1015, "loss": 5.65820837020874, "lr": 0.0002, "elapsed_sec": 8419.26036787033, "step_time_sec": 8.229656197974691, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1016, "loss": 5.7361955642700195, "lr": 0.0002, "elapsed_sec": 8427.489716053009, "step_time_sec": 8.22914581297664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1017, "loss": 5.746600151062012, "lr": 0.0002, "elapsed_sec": 8435.718025684357, "step_time_sec": 8.228177008015336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1018, "loss": 5.709837913513184, "lr": 0.0002, "elapsed_sec": 8443.94489479065, "step_time_sec": 8.226676607009722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1019, "loss": 5.743813991546631, "lr": 0.0002, "elapsed_sec": 8452.174971342087, "step_time_sec": 8.229915561998496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1020, "loss": 5.6866631507873535, "lr": 0.0002, "elapsed_sec": 8460.404419660568, "step_time_sec": 8.229284556000493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1021, "loss": 5.726356029510498, "lr": 0.0002, "elapsed_sec": 8468.63297176361, "step_time_sec": 8.228391045995522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1022, "loss": 5.695237636566162, "lr": 0.0002, "elapsed_sec": 8476.863008975983, "step_time_sec": 8.2299145119905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1023, "loss": 5.6728596687316895, "lr": 0.0002, "elapsed_sec": 8485.092853784561, "step_time_sec": 8.229675654991297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1024, "loss": 5.932042598724365, "lr": 0.0002, "elapsed_sec": 8493.322004556656, "step_time_sec": 8.22895822999999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1025, "loss": 5.6290202140808105, "lr": 0.0002, "elapsed_sec": 8501.55207324028, "step_time_sec": 8.22991593502229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1026, "loss": 5.780593395233154, "lr": 0.0002, "elapsed_sec": 8509.782759428024, "step_time_sec": 8.23057644299115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1027, "loss": 5.763006687164307, "lr": 0.0002, "elapsed_sec": 8518.011290311813, "step_time_sec": 8.228314603009494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1028, "loss": 5.677393913269043, "lr": 0.0002, "elapsed_sec": 8526.23913526535, "step_time_sec": 8.227762641996378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1029, "loss": 5.7031331062316895, "lr": 0.0002, "elapsed_sec": 8534.468068361282, "step_time_sec": 8.228706026013242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1030, "loss": 5.7238383293151855, "lr": 0.0002, "elapsed_sec": 8542.694795131683, "step_time_sec": 8.226579924987163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1031, "loss": 5.7676215171813965, "lr": 0.0002, "elapsed_sec": 8550.92323255539, "step_time_sec": 8.228352222999092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1032, "loss": 5.748566150665283, "lr": 0.0002, "elapsed_sec": 8559.150975704193, "step_time_sec": 8.227539149986114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1033, "loss": 5.6889214515686035, "lr": 0.0002, "elapsed_sec": 8567.37697672844, "step_time_sec": 8.225839041988365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1034, "loss": 5.704984188079834, "lr": 0.0002, "elapsed_sec": 8575.604864358902, "step_time_sec": 8.227777728025103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1035, "loss": 5.680926322937012, "lr": 0.0002, "elapsed_sec": 8583.833467960358, "step_time_sec": 8.228404023015173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1036, "loss": 5.857619285583496, "lr": 0.0002, "elapsed_sec": 8592.062552213669, "step_time_sec": 8.228920729015954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1037, "loss": 5.649969577789307, "lr": 0.0002, "elapsed_sec": 8600.291911840439, "step_time_sec": 8.22920714400243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1038, "loss": 5.778899669647217, "lr": 0.0002, "elapsed_sec": 8608.520359277725, "step_time_sec": 8.2282927549968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1039, "loss": 5.59987211227417, "lr": 0.0002, "elapsed_sec": 8616.750552654266, "step_time_sec": 8.230039909016341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1040, "loss": 5.664917469024658, "lr": 0.0002, "elapsed_sec": 8624.98041844368, "step_time_sec": 8.229755827982444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1041, "loss": 5.665317058563232, "lr": 0.0002, "elapsed_sec": 8633.210891485214, "step_time_sec": 8.230275702982908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1042, "loss": 5.623473167419434, "lr": 0.0002, "elapsed_sec": 8641.441508293152, "step_time_sec": 8.230460120015778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1043, "loss": 5.761837005615234, "lr": 0.0002, "elapsed_sec": 8649.671588659286, "step_time_sec": 8.22996370002511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1044, "loss": 5.673610210418701, "lr": 0.0002, "elapsed_sec": 8657.901933431625, "step_time_sec": 8.230230579996714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1045, "loss": 5.690779209136963, "lr": 0.0002, "elapsed_sec": 8666.131685733795, "step_time_sec": 8.229554928984726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1046, "loss": 5.82152795791626, "lr": 0.0002, "elapsed_sec": 8674.361715078354, "step_time_sec": 8.22981823401642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1047, "loss": 5.777355670928955, "lr": 0.0002, "elapsed_sec": 8682.59229016304, "step_time_sec": 8.230432231997838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1048, "loss": 5.810904502868652, "lr": 0.0002, "elapsed_sec": 8690.824053525925, "step_time_sec": 8.231602512998506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1049, "loss": 5.736048698425293, "lr": 0.0002, "elapsed_sec": 8699.053855419159, "step_time_sec": 8.22963835200062, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1050, "loss": 5.778247833251953, "lr": 0.0002, "elapsed_sec": 8707.2831428051, "step_time_sec": 8.229143175994977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1051, "loss": 5.642415523529053, "lr": 0.0002, "elapsed_sec": 8715.513336658478, "step_time_sec": 8.230105299997376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1052, "loss": 5.727174758911133, "lr": 0.0002, "elapsed_sec": 8723.740763902664, "step_time_sec": 8.227192180987913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1053, "loss": 5.7644877433776855, "lr": 0.0002, "elapsed_sec": 8731.968091487885, "step_time_sec": 8.22719597100513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1054, "loss": 5.648550987243652, "lr": 0.0002, "elapsed_sec": 8740.19647860527, "step_time_sec": 8.22826570601319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1055, "loss": 5.703385353088379, "lr": 0.0002, "elapsed_sec": 8748.425819396973, "step_time_sec": 8.229200718022184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1056, "loss": 5.76930570602417, "lr": 0.0002, "elapsed_sec": 8756.65275645256, "step_time_sec": 8.226721035985975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1057, "loss": 5.601773738861084, "lr": 0.0002, "elapsed_sec": 8764.880321979523, "step_time_sec": 8.227446446020622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1058, "loss": 5.602904796600342, "lr": 0.0002, "elapsed_sec": 8773.107227802277, "step_time_sec": 8.226712988020154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1059, "loss": 5.655826568603516, "lr": 0.0002, "elapsed_sec": 8781.337300777435, "step_time_sec": 8.229978240997298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1060, "loss": 5.810366153717041, "lr": 0.0002, "elapsed_sec": 8789.56677699089, "step_time_sec": 8.229290240007685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1061, "loss": 5.7428388595581055, "lr": 0.0002, "elapsed_sec": 8797.795129537582, "step_time_sec": 8.22819337999681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1062, "loss": 5.71402645111084, "lr": 0.0002, "elapsed_sec": 8806.024720668793, "step_time_sec": 8.229494875995442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1063, "loss": 5.684255123138428, "lr": 0.0002, "elapsed_sec": 8814.252261161804, "step_time_sec": 8.227334754017647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1064, "loss": 5.6559343338012695, "lr": 0.0002, "elapsed_sec": 8822.481950044632, "step_time_sec": 8.229534357989905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1065, "loss": 5.732975006103516, "lr": 0.0002, "elapsed_sec": 8830.711822271347, "step_time_sec": 8.229710360988975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1066, "loss": 5.733541488647461, "lr": 0.0002, "elapsed_sec": 8838.941868305206, "step_time_sec": 8.229902778984979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1067, "loss": 5.637548446655273, "lr": 0.0002, "elapsed_sec": 8847.17146229744, "step_time_sec": 8.229462930001318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1068, "loss": 5.731112003326416, "lr": 0.0002, "elapsed_sec": 8855.401229858398, "step_time_sec": 8.229592552001122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1069, "loss": 5.739403247833252, "lr": 0.0002, "elapsed_sec": 8863.62862253189, "step_time_sec": 8.227243692992488, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1070, "loss": 5.697066307067871, "lr": 0.0002, "elapsed_sec": 8871.858012199402, "step_time_sec": 8.22922744799871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1071, "loss": 5.567259311676025, "lr": 0.0002, "elapsed_sec": 8880.08743095398, "step_time_sec": 8.229302911000559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1072, "loss": 5.607485771179199, "lr": 0.0002, "elapsed_sec": 8888.315417289734, "step_time_sec": 8.227815403981367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1073, "loss": 5.664398193359375, "lr": 0.0002, "elapsed_sec": 8896.54250741005, "step_time_sec": 8.226961545005906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1074, "loss": 5.622435569763184, "lr": 0.0002, "elapsed_sec": 8904.77122426033, "step_time_sec": 8.228532756998902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1075, "loss": 5.7146525382995605, "lr": 0.0002, "elapsed_sec": 8913.000924825668, "step_time_sec": 8.229536531987833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1076, "loss": 5.649174213409424, "lr": 0.0002, "elapsed_sec": 8921.230381965637, "step_time_sec": 8.229367469000863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1077, "loss": 5.6612935066223145, "lr": 0.0002, "elapsed_sec": 8929.457471132278, "step_time_sec": 8.226900837995345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1078, "loss": 5.629218578338623, "lr": 0.0002, "elapsed_sec": 8937.686877250671, "step_time_sec": 8.229227763978997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1079, "loss": 5.590912342071533, "lr": 0.0002, "elapsed_sec": 8945.916957855225, "step_time_sec": 8.229937613010406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1080, "loss": 5.634620189666748, "lr": 0.0002, "elapsed_sec": 8954.147818803787, "step_time_sec": 8.230711331998464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1081, "loss": 5.632055759429932, "lr": 0.0002, "elapsed_sec": 8962.375947237015, "step_time_sec": 8.227997879002942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1082, "loss": 5.462393283843994, "lr": 0.0002, "elapsed_sec": 8970.605711460114, "step_time_sec": 8.229597569006728, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1083, "loss": 5.614382266998291, "lr": 0.0002, "elapsed_sec": 8978.832820415497, "step_time_sec": 8.226949962001527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1084, "loss": 5.642831802368164, "lr": 0.0002, "elapsed_sec": 8987.062495231628, "step_time_sec": 8.229529016010929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1085, "loss": 5.699198246002197, "lr": 0.0002, "elapsed_sec": 8995.289705991745, "step_time_sec": 8.227100671996595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1086, "loss": 5.636162757873535, "lr": 0.0002, "elapsed_sec": 9003.519641637802, "step_time_sec": 8.229753937979694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1087, "loss": 5.64954137802124, "lr": 0.0002, "elapsed_sec": 9011.74988746643, "step_time_sec": 8.230138198006898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1088, "loss": 5.587447643280029, "lr": 0.0002, "elapsed_sec": 9019.979687213898, "step_time_sec": 8.22960436300491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1089, "loss": 5.584773540496826, "lr": 0.0002, "elapsed_sec": 9028.209778547287, "step_time_sec": 8.229923932987731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1090, "loss": 5.577056884765625, "lr": 0.0002, "elapsed_sec": 9036.439467668533, "step_time_sec": 8.229597684025066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1091, "loss": 5.635481357574463, "lr": 0.0002, "elapsed_sec": 9044.667095422745, "step_time_sec": 8.227425122982822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1092, "loss": 5.7226881980896, "lr": 0.0002, "elapsed_sec": 9052.897166967392, "step_time_sec": 8.229891233990202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1093, "loss": 5.659003257751465, "lr": 0.0002, "elapsed_sec": 9061.125066757202, "step_time_sec": 8.227753462007968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1094, "loss": 5.709053993225098, "lr": 0.0002, "elapsed_sec": 9069.35229229927, "step_time_sec": 8.227083322999533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1095, "loss": 5.671091556549072, "lr": 0.0002, "elapsed_sec": 9077.581969976425, "step_time_sec": 8.22950255998876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1096, "loss": 5.679475784301758, "lr": 0.0002, "elapsed_sec": 9085.811104774475, "step_time_sec": 8.228973713994492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1097, "loss": 5.645805358886719, "lr": 0.0002, "elapsed_sec": 9094.041222810745, "step_time_sec": 8.22998748300597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1098, "loss": 5.666377067565918, "lr": 0.0002, "elapsed_sec": 9102.27134346962, "step_time_sec": 8.230013845983194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1099, "loss": 5.556997776031494, "lr": 0.0002, "elapsed_sec": 9110.501942873001, "step_time_sec": 8.230411694996292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1100, "loss": 5.609332084655762, "lr": 0.0002, "elapsed_sec": 9118.731496810913, "step_time_sec": 8.229343429993605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1101, "loss": 5.678241729736328, "lr": 0.0002, "elapsed_sec": 9126.961267232895, "step_time_sec": 8.229653408983722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1102, "loss": 5.526804447174072, "lr": 0.0002, "elapsed_sec": 9135.191815137863, "step_time_sec": 8.230420375009999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1103, "loss": 5.690858840942383, "lr": 0.0002, "elapsed_sec": 9143.421519756317, "step_time_sec": 8.229495881998446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1104, "loss": 5.621304988861084, "lr": 0.0002, "elapsed_sec": 9151.650877714157, "step_time_sec": 8.229224009002792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1105, "loss": 5.7157883644104, "lr": 0.0002, "elapsed_sec": 9159.880441904068, "step_time_sec": 8.229403809993528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1106, "loss": 5.684826850891113, "lr": 0.0002, "elapsed_sec": 9168.110336780548, "step_time_sec": 8.229773113009287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1107, "loss": 5.6019086837768555, "lr": 0.0002, "elapsed_sec": 9176.339635848999, "step_time_sec": 8.229108309024014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1108, "loss": 5.6401495933532715, "lr": 0.0002, "elapsed_sec": 9184.566986322403, "step_time_sec": 8.227197109983535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1109, "loss": 5.597107410430908, "lr": 0.0002, "elapsed_sec": 9192.79139637947, "step_time_sec": 8.22424104201491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1110, "loss": 5.619153022766113, "lr": 0.0002, "elapsed_sec": 9201.019297838211, "step_time_sec": 8.227747338009067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1111, "loss": 5.686688423156738, "lr": 0.0002, "elapsed_sec": 9209.247845888138, "step_time_sec": 8.228405847999966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1112, "loss": 5.682906627655029, "lr": 0.0002, "elapsed_sec": 9217.476999759674, "step_time_sec": 8.228996349993395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1113, "loss": 5.663275718688965, "lr": 0.0002, "elapsed_sec": 9225.70668554306, "step_time_sec": 8.229522056994028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1114, "loss": 5.532290935516357, "lr": 0.0002, "elapsed_sec": 9233.936758995056, "step_time_sec": 8.229911180009367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1115, "loss": 5.752762794494629, "lr": 0.0002, "elapsed_sec": 9242.167499780655, "step_time_sec": 8.230616116023157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1116, "loss": 5.642740249633789, "lr": 0.0002, "elapsed_sec": 9250.397146463394, "step_time_sec": 8.22950455598766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1117, "loss": 5.601714134216309, "lr": 0.0002, "elapsed_sec": 9258.627445459366, "step_time_sec": 8.230190530011896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1118, "loss": 5.665139675140381, "lr": 0.0002, "elapsed_sec": 9266.856734514236, "step_time_sec": 8.229094937007176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1119, "loss": 5.542497158050537, "lr": 0.0002, "elapsed_sec": 9275.086322069168, "step_time_sec": 8.229394450987456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1120, "loss": 5.813870429992676, "lr": 0.0002, "elapsed_sec": 9283.316999197006, "step_time_sec": 8.230518330994528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1121, "loss": 5.694828987121582, "lr": 0.0002, "elapsed_sec": 9291.544620990753, "step_time_sec": 8.227470167999854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1122, "loss": 5.581991195678711, "lr": 0.0002, "elapsed_sec": 9299.773690462112, "step_time_sec": 8.228932804020587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1123, "loss": 5.668137073516846, "lr": 0.0002, "elapsed_sec": 9308.003408193588, "step_time_sec": 8.229557615995873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1124, "loss": 5.574669361114502, "lr": 0.0002, "elapsed_sec": 9316.232563734055, "step_time_sec": 8.228990898001939, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1125, "loss": 5.584060192108154, "lr": 0.0002, "elapsed_sec": 9324.462585449219, "step_time_sec": 8.229888644011226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1126, "loss": 5.71416711807251, "lr": 0.0002, "elapsed_sec": 9332.68956565857, "step_time_sec": 8.226825943012955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1127, "loss": 5.707598686218262, "lr": 0.0002, "elapsed_sec": 9340.918159246445, "step_time_sec": 8.228453718998935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1128, "loss": 5.608211517333984, "lr": 0.0002, "elapsed_sec": 9349.148530721664, "step_time_sec": 8.230183534993557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1129, "loss": 5.5843682289123535, "lr": 0.0002, "elapsed_sec": 9357.378591775894, "step_time_sec": 8.22995905400603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1130, "loss": 5.607525825500488, "lr": 0.0002, "elapsed_sec": 9365.605167388916, "step_time_sec": 8.226416457007872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1131, "loss": 5.8395867347717285, "lr": 0.0002, "elapsed_sec": 9373.833773136139, "step_time_sec": 8.228466953005409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1132, "loss": 5.88712215423584, "lr": 0.0002, "elapsed_sec": 9382.06288099289, "step_time_sec": 8.228892319020815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1133, "loss": 5.678305149078369, "lr": 0.0002, "elapsed_sec": 9390.292986154556, "step_time_sec": 8.229902203020174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1134, "loss": 5.72599983215332, "lr": 0.0002, "elapsed_sec": 9398.5227496624, "step_time_sec": 8.229645612998866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1135, "loss": 5.6528000831604, "lr": 0.0002, "elapsed_sec": 9406.751112937927, "step_time_sec": 8.228264692006633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1136, "loss": 5.63776159286499, "lr": 0.0002, "elapsed_sec": 9414.982100963593, "step_time_sec": 8.230741443985607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1137, "loss": 5.576612949371338, "lr": 0.0002, "elapsed_sec": 9423.21221780777, "step_time_sec": 8.22995720000472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1138, "loss": 5.6396098136901855, "lr": 0.0002, "elapsed_sec": 9431.441742420197, "step_time_sec": 8.229440743016312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1139, "loss": 5.576886177062988, "lr": 0.0002, "elapsed_sec": 9439.672595977783, "step_time_sec": 8.230639090994373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1140, "loss": 5.664805889129639, "lr": 0.0002, "elapsed_sec": 9447.902506113052, "step_time_sec": 8.229751049977494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1141, "loss": 5.672874450683594, "lr": 0.0002, "elapsed_sec": 9456.130907773972, "step_time_sec": 8.228337235981598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1142, "loss": 5.588301181793213, "lr": 0.0002, "elapsed_sec": 9464.358962535858, "step_time_sec": 8.227820786996745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1143, "loss": 5.669850826263428, "lr": 0.0002, "elapsed_sec": 9472.589368343353, "step_time_sec": 8.230264254001668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1144, "loss": 5.679689407348633, "lr": 0.0002, "elapsed_sec": 9480.819158792496, "step_time_sec": 8.22966758601251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1145, "loss": 5.623167037963867, "lr": 0.0002, "elapsed_sec": 9489.047792673111, "step_time_sec": 8.228439011989394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1146, "loss": 5.664449214935303, "lr": 0.0002, "elapsed_sec": 9497.276444911957, "step_time_sec": 8.228491369984113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1147, "loss": 5.519106388092041, "lr": 0.0002, "elapsed_sec": 9505.50427031517, "step_time_sec": 8.22772048600018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1148, "loss": 5.596560478210449, "lr": 0.0002, "elapsed_sec": 9513.733034849167, "step_time_sec": 8.228577421978116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1149, "loss": 5.627766132354736, "lr": 0.0002, "elapsed_sec": 9521.959863901138, "step_time_sec": 8.22669017300359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1150, "loss": 5.509012222290039, "lr": 0.0002, "elapsed_sec": 9530.189810276031, "step_time_sec": 8.229762743023457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1151, "loss": 5.523999214172363, "lr": 0.0002, "elapsed_sec": 9538.419922113419, "step_time_sec": 8.230056154978229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1152, "loss": 5.5496673583984375, "lr": 0.0002, "elapsed_sec": 9546.650399208069, "step_time_sec": 8.230238199990708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1153, "loss": 5.544317722320557, "lr": 0.0002, "elapsed_sec": 9554.880475521088, "step_time_sec": 8.229916829004651, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1154, "loss": 5.512048244476318, "lr": 0.0002, "elapsed_sec": 9563.110515356064, "step_time_sec": 8.229878402984468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1155, "loss": 5.579041957855225, "lr": 0.0002, "elapsed_sec": 9571.34127998352, "step_time_sec": 8.230667468975298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1156, "loss": 5.682010173797607, "lr": 0.0002, "elapsed_sec": 9579.571279287338, "step_time_sec": 8.229817068000557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1157, "loss": 5.694503307342529, "lr": 0.0002, "elapsed_sec": 9587.801498651505, "step_time_sec": 8.23007682999014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1158, "loss": 5.63309383392334, "lr": 0.0002, "elapsed_sec": 9596.030009746552, "step_time_sec": 8.228334355022525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1159, "loss": 5.571252346038818, "lr": 0.0002, "elapsed_sec": 9604.258209943771, "step_time_sec": 8.228041149995988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1160, "loss": 5.659143924713135, "lr": 0.0002, "elapsed_sec": 9612.486436367035, "step_time_sec": 8.228069384989794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1161, "loss": 5.615793228149414, "lr": 0.0002, "elapsed_sec": 9620.714813947678, "step_time_sec": 8.228254268004093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1162, "loss": 5.6013407707214355, "lr": 0.0002, "elapsed_sec": 9628.944147586823, "step_time_sec": 8.229140581010142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1163, "loss": 5.647983074188232, "lr": 0.0002, "elapsed_sec": 9637.174936294556, "step_time_sec": 8.230691439996008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1164, "loss": 5.568436622619629, "lr": 0.0002, "elapsed_sec": 9645.40531039238, "step_time_sec": 8.230165357002988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1165, "loss": 5.575326919555664, "lr": 0.0002, "elapsed_sec": 9653.636512517929, "step_time_sec": 8.231035970995435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1166, "loss": 5.53780460357666, "lr": 0.0002, "elapsed_sec": 9661.86659693718, "step_time_sec": 8.229931314010173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1167, "loss": 5.557323932647705, "lr": 0.0002, "elapsed_sec": 9670.096691846848, "step_time_sec": 8.229945790983038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1168, "loss": 5.633837699890137, "lr": 0.0002, "elapsed_sec": 9678.326879501343, "step_time_sec": 8.230079336004565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1169, "loss": 5.626791477203369, "lr": 0.0002, "elapsed_sec": 9686.556996107101, "step_time_sec": 8.229936610994628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1170, "loss": 5.555593490600586, "lr": 0.0002, "elapsed_sec": 9694.78726696968, "step_time_sec": 8.23016129800817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1171, "loss": 5.582589149475098, "lr": 0.0002, "elapsed_sec": 9703.015444040298, "step_time_sec": 8.227962121018209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1172, "loss": 5.572051525115967, "lr": 0.0002, "elapsed_sec": 9711.245619297028, "step_time_sec": 8.230039446993032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1173, "loss": 5.465534210205078, "lr": 0.0002, "elapsed_sec": 9719.47387123108, "step_time_sec": 8.228098252991913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1174, "loss": 5.6299943923950195, "lr": 0.0002, "elapsed_sec": 9727.701691865921, "step_time_sec": 8.227638812997611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1175, "loss": 5.573958873748779, "lr": 0.0002, "elapsed_sec": 9735.93015217781, "step_time_sec": 8.228369287011446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1176, "loss": 5.555603504180908, "lr": 0.0002, "elapsed_sec": 9744.159151315689, "step_time_sec": 8.228783837024821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1177, "loss": 5.498271942138672, "lr": 0.0002, "elapsed_sec": 9752.385518550873, "step_time_sec": 8.226268405007431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1178, "loss": 5.600169658660889, "lr": 0.0002, "elapsed_sec": 9760.613312482834, "step_time_sec": 8.227667257015128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1179, "loss": 5.543737411499023, "lr": 0.0002, "elapsed_sec": 9768.842891216278, "step_time_sec": 8.22935171899735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1180, "loss": 5.396057605743408, "lr": 0.0002, "elapsed_sec": 9777.071965456009, "step_time_sec": 8.228930633020354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1181, "loss": 5.55145263671875, "lr": 0.0002, "elapsed_sec": 9785.300126791, "step_time_sec": 8.228010540973628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1182, "loss": 5.481174945831299, "lr": 0.0002, "elapsed_sec": 9793.526692867279, "step_time_sec": 8.226459631987382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1183, "loss": 5.5334649085998535, "lr": 0.0002, "elapsed_sec": 9801.757051467896, "step_time_sec": 8.230138764018193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1184, "loss": 5.609901428222656, "lr": 0.0002, "elapsed_sec": 9809.98683643341, "step_time_sec": 8.229621702019358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1185, "loss": 5.588444709777832, "lr": 0.0002, "elapsed_sec": 9818.215593099594, "step_time_sec": 8.228609394980595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1186, "loss": 5.519460678100586, "lr": 0.0002, "elapsed_sec": 9826.443905115128, "step_time_sec": 8.22814775898587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1187, "loss": 5.58220911026001, "lr": 0.0002, "elapsed_sec": 9834.672040462494, "step_time_sec": 8.22800166500383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1188, "loss": 5.5655364990234375, "lr": 0.0002, "elapsed_sec": 9842.9002597332, "step_time_sec": 8.228052782011218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1189, "loss": 5.56648588180542, "lr": 0.0002, "elapsed_sec": 9851.129731416702, "step_time_sec": 8.229332178016193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1190, "loss": 5.515830993652344, "lr": 0.0002, "elapsed_sec": 9859.358702898026, "step_time_sec": 8.228852430998813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1191, "loss": 5.477833271026611, "lr": 0.0002, "elapsed_sec": 9867.586060762405, "step_time_sec": 8.227205900009722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1192, "loss": 5.46124267578125, "lr": 0.0002, "elapsed_sec": 9875.814212083817, "step_time_sec": 8.227980985015165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1193, "loss": 5.609429359436035, "lr": 0.0002, "elapsed_sec": 9884.043245315552, "step_time_sec": 8.228930529003264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1194, "loss": 5.5623579025268555, "lr": 0.0002, "elapsed_sec": 9892.27128148079, "step_time_sec": 8.227853254997171, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1195, "loss": 5.431643009185791, "lr": 0.0002, "elapsed_sec": 9900.501665592194, "step_time_sec": 8.230184917978477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1196, "loss": 5.519975662231445, "lr": 0.0002, "elapsed_sec": 9908.731619596481, "step_time_sec": 8.229832017008448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1197, "loss": 5.589743137359619, "lr": 0.0002, "elapsed_sec": 9916.962189674377, "step_time_sec": 8.230405052017886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1198, "loss": 5.462673664093018, "lr": 0.0002, "elapsed_sec": 9925.190434455872, "step_time_sec": 8.228050747013185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1199, "loss": 5.6563005447387695, "lr": 0.0002, "elapsed_sec": 9933.420503616333, "step_time_sec": 8.229943743004696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1200, "loss": 5.532320022583008, "lr": 0.0002, "elapsed_sec": 9941.65063405037, "step_time_sec": 8.229971170978388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1201, "loss": 5.653494834899902, "lr": 0.0002, "elapsed_sec": 9949.88128066063, "step_time_sec": 8.230505048006307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1202, "loss": 5.4700140953063965, "lr": 0.0002, "elapsed_sec": 9958.108242034912, "step_time_sec": 8.226795658993069, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1203, "loss": 5.464073657989502, "lr": 0.0002, "elapsed_sec": 9966.338738679886, "step_time_sec": 8.230404996982543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1204, "loss": 5.652071475982666, "lr": 0.0002, "elapsed_sec": 9974.568662881851, "step_time_sec": 8.229704349010717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1205, "loss": 5.5933027267456055, "lr": 0.0002, "elapsed_sec": 9982.798387765884, "step_time_sec": 8.229605318978429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1206, "loss": 5.467453479766846, "lr": 0.0002, "elapsed_sec": 9991.028276205063, "step_time_sec": 8.22979447801481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1207, "loss": 5.574728965759277, "lr": 0.0002, "elapsed_sec": 9999.258513212204, "step_time_sec": 8.229999981995206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1208, "loss": 5.445085525512695, "lr": 0.0002, "elapsed_sec": 10007.48887705803, "step_time_sec": 8.230231030000141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1209, "loss": 5.634969711303711, "lr": 0.0002, "elapsed_sec": 10015.719889879227, "step_time_sec": 8.23081545601599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1210, "loss": 5.654153347015381, "lr": 0.0002, "elapsed_sec": 10023.950390338898, "step_time_sec": 8.230348373006564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1211, "loss": 5.602108001708984, "lr": 0.0002, "elapsed_sec": 10032.180440425873, "step_time_sec": 8.229914991999976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1212, "loss": 5.681407451629639, "lr": 0.0002, "elapsed_sec": 10040.411254882812, "step_time_sec": 8.230664370988961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1213, "loss": 5.666357040405273, "lr": 0.0002, "elapsed_sec": 10048.64153289795, "step_time_sec": 8.23013843598892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1214, "loss": 5.543376922607422, "lr": 0.0002, "elapsed_sec": 10056.871004343033, "step_time_sec": 8.229308562993538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1215, "loss": 5.51506233215332, "lr": 0.0002, "elapsed_sec": 10065.101355314255, "step_time_sec": 8.230177341989474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1216, "loss": 5.505213737487793, "lr": 0.0002, "elapsed_sec": 10073.330545902252, "step_time_sec": 8.229046947992174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1217, "loss": 5.420942306518555, "lr": 0.0002, "elapsed_sec": 10081.560159683228, "step_time_sec": 8.22946926401346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1218, "loss": 5.473155975341797, "lr": 0.0002, "elapsed_sec": 10089.790727376938, "step_time_sec": 8.230412008007988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1219, "loss": 5.565522193908691, "lr": 0.0002, "elapsed_sec": 10098.020611047745, "step_time_sec": 8.229704828001559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1220, "loss": 5.498402118682861, "lr": 0.0002, "elapsed_sec": 10106.247621536255, "step_time_sec": 8.226914497994585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1221, "loss": 5.548435688018799, "lr": 0.0002, "elapsed_sec": 10114.476898431778, "step_time_sec": 8.229086424980778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1222, "loss": 5.3724751472473145, "lr": 0.0002, "elapsed_sec": 10122.70682144165, "step_time_sec": 8.229818975989474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1223, "loss": 5.663214683532715, "lr": 0.0002, "elapsed_sec": 10130.937629699707, "step_time_sec": 8.230655658990145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1224, "loss": 5.418937683105469, "lr": 0.0002, "elapsed_sec": 10139.167440652847, "step_time_sec": 8.229644120990997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1225, "loss": 5.506650447845459, "lr": 0.0002, "elapsed_sec": 10147.39757180214, "step_time_sec": 8.229952049994608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1226, "loss": 5.451193809509277, "lr": 0.0002, "elapsed_sec": 10155.626853942871, "step_time_sec": 8.229142077994766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1227, "loss": 5.59239387512207, "lr": 0.0002, "elapsed_sec": 10163.85323214531, "step_time_sec": 8.226191191992257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1228, "loss": 5.551384925842285, "lr": 0.0002, "elapsed_sec": 10172.082866191864, "step_time_sec": 8.229480189009337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1229, "loss": 5.533841609954834, "lr": 0.0002, "elapsed_sec": 10180.311722755432, "step_time_sec": 8.22870098700514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1230, "loss": 5.408297538757324, "lr": 0.0002, "elapsed_sec": 10188.540479183197, "step_time_sec": 8.228613292012597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1231, "loss": 5.712447643280029, "lr": 0.0002, "elapsed_sec": 10196.769941091537, "step_time_sec": 8.22930506701232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1232, "loss": 5.523532867431641, "lr": 0.0002, "elapsed_sec": 10204.998081922531, "step_time_sec": 8.227994903980289, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1233, "loss": 5.547264099121094, "lr": 0.0002, "elapsed_sec": 10213.227115392685, "step_time_sec": 8.228857023001183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1234, "loss": 5.653656482696533, "lr": 0.0002, "elapsed_sec": 10221.455395460129, "step_time_sec": 8.228151006012922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1235, "loss": 5.481283187866211, "lr": 0.0002, "elapsed_sec": 10229.68286538124, "step_time_sec": 8.227364087011665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1236, "loss": 5.555461406707764, "lr": 0.0002, "elapsed_sec": 10237.911298274994, "step_time_sec": 8.228285138000501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1237, "loss": 5.647822856903076, "lr": 0.0002, "elapsed_sec": 10246.139413118362, "step_time_sec": 8.22797655899194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1238, "loss": 5.552128791809082, "lr": 0.0002, "elapsed_sec": 10254.37107515335, "step_time_sec": 8.231439789989963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1239, "loss": 5.463386058807373, "lr": 0.0002, "elapsed_sec": 10262.601032972336, "step_time_sec": 8.229793231992517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1240, "loss": 5.358076095581055, "lr": 0.0002, "elapsed_sec": 10270.829500198364, "step_time_sec": 8.228311640006723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1241, "loss": 5.501327037811279, "lr": 0.0002, "elapsed_sec": 10279.059492588043, "step_time_sec": 8.229857858008472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1242, "loss": 5.517333984375, "lr": 0.0002, "elapsed_sec": 10287.286674499512, "step_time_sec": 8.227023968996946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1243, "loss": 5.520384788513184, "lr": 0.0002, "elapsed_sec": 10295.51512503624, "step_time_sec": 8.228285972989397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1244, "loss": 5.527592182159424, "lr": 0.0002, "elapsed_sec": 10303.744078159332, "step_time_sec": 8.228820494987303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1245, "loss": 5.539409637451172, "lr": 0.0002, "elapsed_sec": 10311.974240541458, "step_time_sec": 8.229985340003623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1246, "loss": 5.529576778411865, "lr": 0.0002, "elapsed_sec": 10320.243112564087, "step_time_sec": 8.225219246000051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1247, "loss": 5.487466812133789, "lr": 0.0002, "elapsed_sec": 10328.470373153687, "step_time_sec": 8.227126121986657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1248, "loss": 5.478843688964844, "lr": 0.0002, "elapsed_sec": 10336.698389053345, "step_time_sec": 8.227875254990067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1249, "loss": 5.556009769439697, "lr": 0.0002, "elapsed_sec": 10344.9282143116, "step_time_sec": 8.229659101983998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1250, "loss": 5.477759838104248, "lr": 0.0002, "elapsed_sec": 10353.157923221588, "step_time_sec": 8.22958035999909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1251, "loss": 5.458887577056885, "lr": 0.0002, "elapsed_sec": 10361.388422966003, "step_time_sec": 8.230329317972064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1252, "loss": 5.466886520385742, "lr": 0.0002, "elapsed_sec": 10369.618727684021, "step_time_sec": 8.230152723001083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1253, "loss": 5.505678653717041, "lr": 0.0002, "elapsed_sec": 10377.848898887634, "step_time_sec": 8.230014286993537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1254, "loss": 5.487246036529541, "lr": 0.0002, "elapsed_sec": 10386.078619241714, "step_time_sec": 8.229584807006177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1255, "loss": 5.4675703048706055, "lr": 0.0002, "elapsed_sec": 10394.30817079544, "step_time_sec": 8.229351877991576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1256, "loss": 5.537589073181152, "lr": 0.0002, "elapsed_sec": 10402.538867235184, "step_time_sec": 8.230540082004154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1257, "loss": 5.413910388946533, "lr": 0.0002, "elapsed_sec": 10410.766807079315, "step_time_sec": 8.227795819984749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1258, "loss": 5.407729625701904, "lr": 0.0002, "elapsed_sec": 10418.994829416275, "step_time_sec": 8.227871034003329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1259, "loss": 5.485443115234375, "lr": 0.0002, "elapsed_sec": 10427.224785804749, "step_time_sec": 8.22982946402044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1260, "loss": 5.503393173217773, "lr": 0.0002, "elapsed_sec": 10435.453939914703, "step_time_sec": 8.228981610998744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1261, "loss": 5.514583110809326, "lr": 0.0002, "elapsed_sec": 10443.682260990143, "step_time_sec": 8.228162340004928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1262, "loss": 5.429033279418945, "lr": 0.0002, "elapsed_sec": 10451.91083574295, "step_time_sec": 8.228419693012256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1263, "loss": 5.443813800811768, "lr": 0.0002, "elapsed_sec": 10460.140134334564, "step_time_sec": 8.229207018011948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1264, "loss": 5.48978853225708, "lr": 0.0002, "elapsed_sec": 10468.369873046875, "step_time_sec": 8.229512902995339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1265, "loss": 5.463457107543945, "lr": 0.0002, "elapsed_sec": 10476.600584030151, "step_time_sec": 8.230558086012024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1266, "loss": 5.434953212738037, "lr": 0.0002, "elapsed_sec": 10484.829419851303, "step_time_sec": 8.22869138000533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1267, "loss": 5.554238796234131, "lr": 0.0002, "elapsed_sec": 10493.057295799255, "step_time_sec": 8.227781902009156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1268, "loss": 5.517668724060059, "lr": 0.0002, "elapsed_sec": 10501.285996437073, "step_time_sec": 8.228538828989258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1269, "loss": 5.534060478210449, "lr": 0.0002, "elapsed_sec": 10509.51260137558, "step_time_sec": 8.226424074993702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1270, "loss": 5.4459357261657715, "lr": 0.0002, "elapsed_sec": 10517.74076628685, "step_time_sec": 8.227995020977687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1271, "loss": 5.422922134399414, "lr": 0.0002, "elapsed_sec": 10525.970719099045, "step_time_sec": 8.229792986996472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1272, "loss": 5.381073474884033, "lr": 0.0002, "elapsed_sec": 10534.200918674469, "step_time_sec": 8.230103061010595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1273, "loss": 5.538698673248291, "lr": 0.0002, "elapsed_sec": 10542.430762767792, "step_time_sec": 8.229596929013496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1274, "loss": 5.4998016357421875, "lr": 0.0002, "elapsed_sec": 10550.661475658417, "step_time_sec": 8.230551782005932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1275, "loss": 5.46717643737793, "lr": 0.0002, "elapsed_sec": 10558.891443490982, "step_time_sec": 8.229826414986746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1276, "loss": 5.3665947914123535, "lr": 0.0002, "elapsed_sec": 10567.120560407639, "step_time_sec": 8.228977018006844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1277, "loss": 5.4568328857421875, "lr": 0.0002, "elapsed_sec": 10575.350739717484, "step_time_sec": 8.23008093397948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1278, "loss": 5.439971923828125, "lr": 0.0002, "elapsed_sec": 10583.580790758133, "step_time_sec": 8.229820918000769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1279, "loss": 5.43610143661499, "lr": 0.0002, "elapsed_sec": 10591.81078863144, "step_time_sec": 8.229851543001132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1280, "loss": 5.5180535316467285, "lr": 0.0002, "elapsed_sec": 10600.040849685669, "step_time_sec": 8.229904344014358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1281, "loss": 5.484982490539551, "lr": 0.0002, "elapsed_sec": 10608.269005298615, "step_time_sec": 8.228064160997747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1282, "loss": 5.382582187652588, "lr": 0.0002, "elapsed_sec": 10616.497013807297, "step_time_sec": 8.22781285099336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1283, "loss": 5.461899280548096, "lr": 0.0002, "elapsed_sec": 10624.72650194168, "step_time_sec": 8.229311748000327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1284, "loss": 5.521420001983643, "lr": 0.0002, "elapsed_sec": 10632.956838607788, "step_time_sec": 8.230140327999834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1285, "loss": 5.468243598937988, "lr": 0.0002, "elapsed_sec": 10641.187468767166, "step_time_sec": 8.23049240198452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1286, "loss": 5.590656280517578, "lr": 0.0002, "elapsed_sec": 10649.417083978653, "step_time_sec": 8.229476171982242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1287, "loss": 5.560186862945557, "lr": 0.0002, "elapsed_sec": 10657.646883964539, "step_time_sec": 8.229606020991923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1288, "loss": 5.525536060333252, "lr": 0.0002, "elapsed_sec": 10665.876679897308, "step_time_sec": 8.229717904992867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1289, "loss": 5.3322906494140625, "lr": 0.0002, "elapsed_sec": 10674.107548236847, "step_time_sec": 8.230645757983439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1290, "loss": 5.408121109008789, "lr": 0.0002, "elapsed_sec": 10682.334763288498, "step_time_sec": 8.227063249010826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1291, "loss": 5.475848197937012, "lr": 0.0002, "elapsed_sec": 10690.563426494598, "step_time_sec": 8.22855455899844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1292, "loss": 5.517673015594482, "lr": 0.0002, "elapsed_sec": 10698.791994571686, "step_time_sec": 8.22834245901322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1293, "loss": 5.464345455169678, "lr": 0.0002, "elapsed_sec": 10707.021388053894, "step_time_sec": 8.22922901000129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1294, "loss": 5.494153022766113, "lr": 0.0002, "elapsed_sec": 10715.251720905304, "step_time_sec": 8.230184551008279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1295, "loss": 5.444564342498779, "lr": 0.0002, "elapsed_sec": 10723.481857061386, "step_time_sec": 8.2300235540024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1296, "loss": 5.546168804168701, "lr": 0.0002, "elapsed_sec": 10731.71278166771, "step_time_sec": 8.230766449996736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1297, "loss": 5.476256847381592, "lr": 0.0002, "elapsed_sec": 10739.940961360931, "step_time_sec": 8.228022023977246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1298, "loss": 5.4232611656188965, "lr": 0.0002, "elapsed_sec": 10748.169059991837, "step_time_sec": 8.227910517976852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1299, "loss": 5.508090019226074, "lr": 0.0002, "elapsed_sec": 10756.399379491806, "step_time_sec": 8.230218779994175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1300, "loss": 5.442134380340576, "lr": 0.0002, "elapsed_sec": 10764.629306554794, "step_time_sec": 8.229720111005008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1301, "loss": 5.427575588226318, "lr": 0.0002, "elapsed_sec": 10772.859491109848, "step_time_sec": 8.230008442013059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1302, "loss": 5.495466709136963, "lr": 0.0002, "elapsed_sec": 10781.087477207184, "step_time_sec": 8.22789448601543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1303, "loss": 5.459002494812012, "lr": 0.0002, "elapsed_sec": 10789.317349910736, "step_time_sec": 8.229725863988278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1304, "loss": 5.516921520233154, "lr": 0.0002, "elapsed_sec": 10797.545395851135, "step_time_sec": 8.227839932980714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1305, "loss": 5.34785795211792, "lr": 0.0002, "elapsed_sec": 10805.773694753647, "step_time_sec": 8.228134032979142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1306, "loss": 5.437723159790039, "lr": 0.0002, "elapsed_sec": 10814.002345085144, "step_time_sec": 8.228520114003913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1307, "loss": 5.480686187744141, "lr": 0.0002, "elapsed_sec": 10822.230333805084, "step_time_sec": 8.22782694498892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1308, "loss": 5.4076385498046875, "lr": 0.0002, "elapsed_sec": 10830.458882808685, "step_time_sec": 8.22840801300481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1309, "loss": 5.459902763366699, "lr": 0.0002, "elapsed_sec": 10838.689317464828, "step_time_sec": 8.230315352993784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1310, "loss": 5.719472885131836, "lr": 0.0002, "elapsed_sec": 10846.919084787369, "step_time_sec": 8.229594412987353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1311, "loss": 5.4531354904174805, "lr": 0.0002, "elapsed_sec": 10855.148592233658, "step_time_sec": 8.229313066985924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1312, "loss": 5.500546455383301, "lr": 0.0002, "elapsed_sec": 10863.37750339508, "step_time_sec": 8.22877205000259, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1313, "loss": 5.395552158355713, "lr": 0.0002, "elapsed_sec": 10871.606792211533, "step_time_sec": 8.229207222000696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1314, "loss": 5.330509185791016, "lr": 0.0002, "elapsed_sec": 10879.837123155594, "step_time_sec": 8.230095822014846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1315, "loss": 5.413529872894287, "lr": 0.0002, "elapsed_sec": 10888.06717991829, "step_time_sec": 8.22990892000962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1316, "loss": 5.44632625579834, "lr": 0.0002, "elapsed_sec": 10896.297232866287, "step_time_sec": 8.229951357003301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1317, "loss": 5.486462593078613, "lr": 0.0002, "elapsed_sec": 10904.524827003479, "step_time_sec": 8.227369145024568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1318, "loss": 5.361860275268555, "lr": 0.0002, "elapsed_sec": 10912.753576993942, "step_time_sec": 8.22860544998548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1319, "loss": 5.361323356628418, "lr": 0.0002, "elapsed_sec": 10920.980776309967, "step_time_sec": 8.227058686985401, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1320, "loss": 5.462493419647217, "lr": 0.0002, "elapsed_sec": 10929.20983171463, "step_time_sec": 8.228920032008318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1321, "loss": 5.326561450958252, "lr": 0.0002, "elapsed_sec": 10937.439992189407, "step_time_sec": 8.229987138998695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1322, "loss": 5.397173881530762, "lr": 0.0002, "elapsed_sec": 10945.668056964874, "step_time_sec": 8.227908725006273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1323, "loss": 5.489904403686523, "lr": 0.0002, "elapsed_sec": 10953.895612478256, "step_time_sec": 8.227450443984708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1324, "loss": 5.5042924880981445, "lr": 0.0002, "elapsed_sec": 10962.122204303741, "step_time_sec": 8.226396938000107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1325, "loss": 5.397862911224365, "lr": 0.0002, "elapsed_sec": 10970.351624250412, "step_time_sec": 8.229241174994968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1326, "loss": 5.566128730773926, "lr": 0.0002, "elapsed_sec": 10978.581406116486, "step_time_sec": 8.229708282015054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1327, "loss": 5.429647922515869, "lr": 0.0002, "elapsed_sec": 10986.80927658081, "step_time_sec": 8.227686644007917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1328, "loss": 5.388434410095215, "lr": 0.0002, "elapsed_sec": 10995.03760266304, "step_time_sec": 8.228201137011638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1329, "loss": 5.343064308166504, "lr": 0.0002, "elapsed_sec": 11003.268109798431, "step_time_sec": 8.230282297998201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1330, "loss": 5.379863262176514, "lr": 0.0002, "elapsed_sec": 11011.497350931168, "step_time_sec": 8.229139211005531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1331, "loss": 5.458694934844971, "lr": 0.0002, "elapsed_sec": 11019.72739982605, "step_time_sec": 8.229855379002402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1332, "loss": 5.32288122177124, "lr": 0.0002, "elapsed_sec": 11027.957378149033, "step_time_sec": 8.229820120992372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1333, "loss": 5.337150573730469, "lr": 0.0002, "elapsed_sec": 11036.188843488693, "step_time_sec": 8.231318578007631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1334, "loss": 5.41562032699585, "lr": 0.0002, "elapsed_sec": 11044.418755531311, "step_time_sec": 8.229760665999493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1335, "loss": 5.542124271392822, "lr": 0.0002, "elapsed_sec": 11052.648233175278, "step_time_sec": 8.229328379995422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1336, "loss": 5.395009994506836, "lr": 0.0002, "elapsed_sec": 11060.878235578537, "step_time_sec": 8.229850572999567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1337, "loss": 5.541747570037842, "lr": 0.0002, "elapsed_sec": 11069.10803604126, "step_time_sec": 8.229661436984316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1338, "loss": 5.462318420410156, "lr": 0.0002, "elapsed_sec": 11077.33916592598, "step_time_sec": 8.230983775982168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1339, "loss": 5.447853088378906, "lr": 0.0002, "elapsed_sec": 11085.56788611412, "step_time_sec": 8.228549057996133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1340, "loss": 5.404531955718994, "lr": 0.0002, "elapsed_sec": 11093.796276330948, "step_time_sec": 8.228262967983028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1341, "loss": 5.35699987411499, "lr": 0.0002, "elapsed_sec": 11102.026656150818, "step_time_sec": 8.230194472009316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1342, "loss": 5.377230644226074, "lr": 0.0002, "elapsed_sec": 11110.254101514816, "step_time_sec": 8.227283565996913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1343, "loss": 5.465502738952637, "lr": 0.0002, "elapsed_sec": 11118.481853723526, "step_time_sec": 8.227609168010531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1344, "loss": 5.4137115478515625, "lr": 0.0002, "elapsed_sec": 11126.710671901703, "step_time_sec": 8.228700022009434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1345, "loss": 5.416406631469727, "lr": 0.0002, "elapsed_sec": 11134.93889594078, "step_time_sec": 8.228021446993807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1346, "loss": 5.2531914710998535, "lr": 0.0002, "elapsed_sec": 11143.16931438446, "step_time_sec": 8.230269873980433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1347, "loss": 5.341501712799072, "lr": 0.0002, "elapsed_sec": 11151.399708509445, "step_time_sec": 8.230231399007607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1348, "loss": 5.397266387939453, "lr": 0.0002, "elapsed_sec": 11159.629956960678, "step_time_sec": 8.230117968982086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1349, "loss": 5.401182174682617, "lr": 0.0002, "elapsed_sec": 11167.858865976334, "step_time_sec": 8.228737855999498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1350, "loss": 5.345470428466797, "lr": 0.0002, "elapsed_sec": 11176.087391614914, "step_time_sec": 8.228407762013376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1351, "loss": 5.43839693069458, "lr": 0.0002, "elapsed_sec": 11184.315860271454, "step_time_sec": 8.228308368008584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1352, "loss": 5.33042049407959, "lr": 0.0002, "elapsed_sec": 11192.543750047684, "step_time_sec": 8.227730726997834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1353, "loss": 5.328221797943115, "lr": 0.0002, "elapsed_sec": 11200.7720079422, "step_time_sec": 8.228094918013085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1354, "loss": 5.535606861114502, "lr": 0.0002, "elapsed_sec": 11209.001651287079, "step_time_sec": 8.22952136999811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1355, "loss": 5.3553972244262695, "lr": 0.0002, "elapsed_sec": 11217.231432676315, "step_time_sec": 8.229593695985386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1356, "loss": 5.387993812561035, "lr": 0.0002, "elapsed_sec": 11225.461261987686, "step_time_sec": 8.229674328991678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1357, "loss": 5.569149494171143, "lr": 0.0002, "elapsed_sec": 11233.691105604172, "step_time_sec": 8.229753736988641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1358, "loss": 5.371999740600586, "lr": 0.0002, "elapsed_sec": 11241.921840906143, "step_time_sec": 8.230554283014499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1359, "loss": 5.503944396972656, "lr": 0.0002, "elapsed_sec": 11250.152884483337, "step_time_sec": 8.230841737997252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1360, "loss": 5.424986839294434, "lr": 0.0002, "elapsed_sec": 11258.382761240005, "step_time_sec": 8.22971707599936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1361, "loss": 5.47074031829834, "lr": 0.0002, "elapsed_sec": 11266.613065719604, "step_time_sec": 8.230177149991505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1362, "loss": 5.388761043548584, "lr": 0.0002, "elapsed_sec": 11274.843357563019, "step_time_sec": 8.230092277022777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1363, "loss": 5.4109697341918945, "lr": 0.0002, "elapsed_sec": 11283.07370519638, "step_time_sec": 8.230201914004283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1364, "loss": 5.4440741539001465, "lr": 0.0002, "elapsed_sec": 11291.303397655487, "step_time_sec": 8.229532880999614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1365, "loss": 5.4141740798950195, "lr": 0.0002, "elapsed_sec": 11299.532896280289, "step_time_sec": 8.22937517601531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1366, "loss": 5.464831352233887, "lr": 0.0002, "elapsed_sec": 11307.762128353119, "step_time_sec": 8.229040233010892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1367, "loss": 5.390719890594482, "lr": 0.0002, "elapsed_sec": 11315.991484165192, "step_time_sec": 8.229210941004567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1368, "loss": 5.378866195678711, "lr": 0.0002, "elapsed_sec": 11324.222429275513, "step_time_sec": 8.23084069500328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1369, "loss": 5.473995685577393, "lr": 0.0002, "elapsed_sec": 11332.454010725021, "step_time_sec": 8.231419579009525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1370, "loss": 5.293039321899414, "lr": 0.0002, "elapsed_sec": 11340.684518814087, "step_time_sec": 8.230294326000148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1371, "loss": 5.328714847564697, "lr": 0.0002, "elapsed_sec": 11348.91274857521, "step_time_sec": 8.228074632992502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1372, "loss": 5.492664813995361, "lr": 0.0002, "elapsed_sec": 11357.141166210175, "step_time_sec": 8.228327965014614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1373, "loss": 5.420698642730713, "lr": 0.0002, "elapsed_sec": 11365.370265722275, "step_time_sec": 8.228906100994209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1374, "loss": 5.349390506744385, "lr": 0.0002, "elapsed_sec": 11373.599961042404, "step_time_sec": 8.22953481102013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1375, "loss": 5.470114231109619, "lr": 0.0002, "elapsed_sec": 11381.829689741135, "step_time_sec": 8.22962528798962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1376, "loss": 5.335556983947754, "lr": 0.0002, "elapsed_sec": 11390.060130357742, "step_time_sec": 8.230240947013954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1377, "loss": 5.334333896636963, "lr": 0.0002, "elapsed_sec": 11398.28824186325, "step_time_sec": 8.22803007697803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1378, "loss": 5.422367572784424, "lr": 0.0002, "elapsed_sec": 11406.517440795898, "step_time_sec": 8.229032884002663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1379, "loss": 5.49714469909668, "lr": 0.0002, "elapsed_sec": 11414.747778892517, "step_time_sec": 8.230107503011823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1380, "loss": 5.431042671203613, "lr": 0.0002, "elapsed_sec": 11422.977872610092, "step_time_sec": 8.229948099004105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1381, "loss": 5.421980381011963, "lr": 0.0002, "elapsed_sec": 11431.208571910858, "step_time_sec": 8.230592534004245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1382, "loss": 5.2968926429748535, "lr": 0.0002, "elapsed_sec": 11439.438310861588, "step_time_sec": 8.229613491974305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1383, "loss": 5.446869850158691, "lr": 0.0002, "elapsed_sec": 11447.668534517288, "step_time_sec": 8.230089152988512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1384, "loss": 5.403665542602539, "lr": 0.0002, "elapsed_sec": 11455.898800611496, "step_time_sec": 8.230016373010585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1385, "loss": 5.407164573669434, "lr": 0.0002, "elapsed_sec": 11464.128032922745, "step_time_sec": 8.229071423993446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1386, "loss": 5.451365947723389, "lr": 0.0002, "elapsed_sec": 11472.355461835861, "step_time_sec": 8.2273004539893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1387, "loss": 5.355420112609863, "lr": 0.0002, "elapsed_sec": 11480.583969831467, "step_time_sec": 8.22834451799281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1388, "loss": 5.251337051391602, "lr": 0.0002, "elapsed_sec": 11488.812304973602, "step_time_sec": 8.22818786802236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1389, "loss": 5.339956760406494, "lr": 0.0002, "elapsed_sec": 11497.042397975922, "step_time_sec": 8.229947436979273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1390, "loss": 5.288456916809082, "lr": 0.0002, "elapsed_sec": 11505.271678447723, "step_time_sec": 8.229122435004683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1391, "loss": 5.317320823669434, "lr": 0.0002, "elapsed_sec": 11513.498982191086, "step_time_sec": 8.227132140018512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1392, "loss": 5.425436496734619, "lr": 0.0002, "elapsed_sec": 11521.727091550827, "step_time_sec": 8.22791992398561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1393, "loss": 5.354106426239014, "lr": 0.0002, "elapsed_sec": 11529.956286907196, "step_time_sec": 8.22906829000567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1394, "loss": 5.386872291564941, "lr": 0.0002, "elapsed_sec": 11538.186758518219, "step_time_sec": 8.230361221998464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1395, "loss": 5.323346138000488, "lr": 0.0002, "elapsed_sec": 11546.416732311249, "step_time_sec": 8.229754604020854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1396, "loss": 5.411965370178223, "lr": 0.0002, "elapsed_sec": 11554.643931627274, "step_time_sec": 8.227092432003701, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1397, "loss": 5.2905755043029785, "lr": 0.0002, "elapsed_sec": 11562.87349152565, "step_time_sec": 8.229403459001333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1398, "loss": 5.397989749908447, "lr": 0.0002, "elapsed_sec": 11571.104863166809, "step_time_sec": 8.231184442993253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1399, "loss": 5.3946027755737305, "lr": 0.0002, "elapsed_sec": 11579.335103988647, "step_time_sec": 8.230109891999746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1400, "loss": 5.4033050537109375, "lr": 0.0002, "elapsed_sec": 11587.56481051445, "step_time_sec": 8.22951178601943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1401, "loss": 5.255270481109619, "lr": 0.0002, "elapsed_sec": 11595.793823719025, "step_time_sec": 8.228873051004484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1402, "loss": 5.183459758758545, "lr": 0.0002, "elapsed_sec": 11604.022777318954, "step_time_sec": 8.228776976000518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1403, "loss": 5.310100555419922, "lr": 0.0002, "elapsed_sec": 11612.24985742569, "step_time_sec": 8.227004718006356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1404, "loss": 5.295159816741943, "lr": 0.0002, "elapsed_sec": 11620.476887226105, "step_time_sec": 8.226823967997916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1405, "loss": 5.350277423858643, "lr": 0.0002, "elapsed_sec": 11628.707669258118, "step_time_sec": 8.230628940014867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1406, "loss": 5.465940475463867, "lr": 0.0002, "elapsed_sec": 11636.93632721901, "step_time_sec": 8.228493594011525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1407, "loss": 5.401731014251709, "lr": 0.0002, "elapsed_sec": 11645.165580749512, "step_time_sec": 8.229108138999436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1408, "loss": 5.40169620513916, "lr": 0.0002, "elapsed_sec": 11653.392538785934, "step_time_sec": 8.22681077499874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1409, "loss": 5.295819282531738, "lr": 0.0002, "elapsed_sec": 11661.621180772781, "step_time_sec": 8.228504108992638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1410, "loss": 5.428177833557129, "lr": 0.0002, "elapsed_sec": 11669.848866939545, "step_time_sec": 8.227544883004157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1411, "loss": 5.384238243103027, "lr": 0.0002, "elapsed_sec": 11678.079549312592, "step_time_sec": 8.230457498983014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1412, "loss": 5.407491207122803, "lr": 0.0002, "elapsed_sec": 11686.30991601944, "step_time_sec": 8.230221948004328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1413, "loss": 5.397771835327148, "lr": 0.0002, "elapsed_sec": 11694.540896892548, "step_time_sec": 8.230875462992117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1414, "loss": 5.304927349090576, "lr": 0.0002, "elapsed_sec": 11702.77155828476, "step_time_sec": 8.230475032993127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1415, "loss": 5.388273239135742, "lr": 0.0002, "elapsed_sec": 11711.00170993805, "step_time_sec": 8.230001444986556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1416, "loss": 5.336938858032227, "lr": 0.0002, "elapsed_sec": 11719.232662677765, "step_time_sec": 8.230814729991835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1417, "loss": 5.215335845947266, "lr": 0.0002, "elapsed_sec": 11727.461755752563, "step_time_sec": 8.22895986601361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1418, "loss": 5.368323802947998, "lr": 0.0002, "elapsed_sec": 11735.690066576004, "step_time_sec": 8.228114442987135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1419, "loss": 5.37953519821167, "lr": 0.0002, "elapsed_sec": 11743.920242071152, "step_time_sec": 8.230031984014204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1420, "loss": 5.656293869018555, "lr": 0.0002, "elapsed_sec": 11752.150851011276, "step_time_sec": 8.230477570992662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1421, "loss": 5.280479907989502, "lr": 0.0002, "elapsed_sec": 11760.379283428192, "step_time_sec": 8.228254257992376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1422, "loss": 5.490659236907959, "lr": 0.0002, "elapsed_sec": 11768.608391284943, "step_time_sec": 8.228938537999056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1423, "loss": 5.341949462890625, "lr": 0.0002, "elapsed_sec": 11776.838566303253, "step_time_sec": 8.230049120989861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1424, "loss": 5.3166890144348145, "lr": 0.0002, "elapsed_sec": 11785.068635225296, "step_time_sec": 8.229881341016153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1425, "loss": 5.422796249389648, "lr": 0.0002, "elapsed_sec": 11793.298805236816, "step_time_sec": 8.230059942987282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1426, "loss": 5.412402153015137, "lr": 0.0002, "elapsed_sec": 11801.527552366257, "step_time_sec": 8.228549933002796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1427, "loss": 5.438803672790527, "lr": 0.0002, "elapsed_sec": 11809.756611824036, "step_time_sec": 8.228959164000116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1428, "loss": 5.685011386871338, "lr": 0.0002, "elapsed_sec": 11817.983832597733, "step_time_sec": 8.2270006199833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1429, "loss": 5.410223484039307, "lr": 0.0002, "elapsed_sec": 11826.211345672607, "step_time_sec": 8.227368439984275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1430, "loss": 5.346933364868164, "lr": 0.0002, "elapsed_sec": 11834.439854383469, "step_time_sec": 8.228336748987203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1431, "loss": 5.299886703491211, "lr": 0.0002, "elapsed_sec": 11842.670288801193, "step_time_sec": 8.230308821977815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1432, "loss": 5.289981842041016, "lr": 0.0002, "elapsed_sec": 11850.898867845535, "step_time_sec": 8.228424813976744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1433, "loss": 5.368850231170654, "lr": 0.0002, "elapsed_sec": 11859.12684583664, "step_time_sec": 8.227802744018845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1434, "loss": 5.3013997077941895, "lr": 0.0002, "elapsed_sec": 11867.357872486115, "step_time_sec": 8.230931842001155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1435, "loss": 5.320317268371582, "lr": 0.0002, "elapsed_sec": 11875.587672948837, "step_time_sec": 8.229578703001607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1436, "loss": 5.4035820960998535, "lr": 0.0002, "elapsed_sec": 11883.818142414093, "step_time_sec": 8.230329268000787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1437, "loss": 5.3899641036987305, "lr": 0.0002, "elapsed_sec": 11892.047892332077, "step_time_sec": 8.229595707991393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1438, "loss": 5.202321529388428, "lr": 0.0002, "elapsed_sec": 11900.276584386826, "step_time_sec": 8.228603633993771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1439, "loss": 5.261757850646973, "lr": 0.0002, "elapsed_sec": 11908.506828546524, "step_time_sec": 8.23001723800553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1440, "loss": 5.347582817077637, "lr": 0.0002, "elapsed_sec": 11916.736956834793, "step_time_sec": 8.229998169990722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1441, "loss": 5.442564010620117, "lr": 0.0002, "elapsed_sec": 11924.96750330925, "step_time_sec": 8.23041624299367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1442, "loss": 5.184544563293457, "lr": 0.0002, "elapsed_sec": 11933.198027849197, "step_time_sec": 8.230315011984203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1443, "loss": 5.188876628875732, "lr": 0.0002, "elapsed_sec": 11941.42715549469, "step_time_sec": 8.228949919983279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1444, "loss": 5.381946086883545, "lr": 0.0002, "elapsed_sec": 11949.657200574875, "step_time_sec": 8.229912352981046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1445, "loss": 5.326719760894775, "lr": 0.0002, "elapsed_sec": 11957.886303424835, "step_time_sec": 8.2289654139895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1446, "loss": 5.286273002624512, "lr": 0.0002, "elapsed_sec": 11966.11505818367, "step_time_sec": 8.228561836993322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1447, "loss": 5.3751397132873535, "lr": 0.0002, "elapsed_sec": 11974.34492468834, "step_time_sec": 8.229686821025098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1448, "loss": 5.399181365966797, "lr": 0.0002, "elapsed_sec": 11982.574824333191, "step_time_sec": 8.229769158991985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1449, "loss": 5.330533027648926, "lr": 0.0002, "elapsed_sec": 11990.804218769073, "step_time_sec": 8.229198379995069, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1450, "loss": 5.248652458190918, "lr": 0.0002, "elapsed_sec": 11999.033522844315, "step_time_sec": 8.22915036699851, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1451, "loss": 5.292382717132568, "lr": 0.0002, "elapsed_sec": 12007.261165618896, "step_time_sec": 8.227539987012278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1452, "loss": 5.460977077484131, "lr": 0.0002, "elapsed_sec": 12015.490360975266, "step_time_sec": 8.229028873000061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1453, "loss": 5.288763046264648, "lr": 0.0002, "elapsed_sec": 12023.71937918663, "step_time_sec": 8.228803779988084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1454, "loss": 5.225565433502197, "lr": 0.0002, "elapsed_sec": 12031.949147701263, "step_time_sec": 8.229588820016943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1455, "loss": 5.304860591888428, "lr": 0.0002, "elapsed_sec": 12040.179329633713, "step_time_sec": 8.230033627012745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1456, "loss": 5.36111307144165, "lr": 0.0002, "elapsed_sec": 12048.408047437668, "step_time_sec": 8.228550189989619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1457, "loss": 5.2632737159729, "lr": 0.0002, "elapsed_sec": 12056.636078119278, "step_time_sec": 8.227798299019923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1458, "loss": 5.278379917144775, "lr": 0.0002, "elapsed_sec": 12064.86464881897, "step_time_sec": 8.22842859098455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1459, "loss": 5.37193489074707, "lr": 0.0002, "elapsed_sec": 12073.092261075974, "step_time_sec": 8.227452674007509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1460, "loss": 5.253283977508545, "lr": 0.0002, "elapsed_sec": 12081.321876049042, "step_time_sec": 8.229446737008402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1461, "loss": 5.3512420654296875, "lr": 0.0002, "elapsed_sec": 12089.552509307861, "step_time_sec": 8.23050009200233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1462, "loss": 5.41215181350708, "lr": 0.0002, "elapsed_sec": 12097.781175613403, "step_time_sec": 8.228506801999174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1463, "loss": 5.393285274505615, "lr": 0.0002, "elapsed_sec": 12106.009858846664, "step_time_sec": 8.228539660020033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1464, "loss": 5.347075462341309, "lr": 0.0002, "elapsed_sec": 12114.24101114273, "step_time_sec": 8.23098878498422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1465, "loss": 5.416265487670898, "lr": 0.0002, "elapsed_sec": 12122.471156835556, "step_time_sec": 8.230005470017204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1466, "loss": 5.3557844161987305, "lr": 0.0002, "elapsed_sec": 12130.703013181686, "step_time_sec": 8.23163521499373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1467, "loss": 5.44078254699707, "lr": 0.0002, "elapsed_sec": 12138.93279337883, "step_time_sec": 8.229612561990507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1468, "loss": 5.351944923400879, "lr": 0.0002, "elapsed_sec": 12147.161391973495, "step_time_sec": 8.22846759101958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1469, "loss": 5.478590488433838, "lr": 0.0002, "elapsed_sec": 12155.391786336899, "step_time_sec": 8.230268634011736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1470, "loss": 5.401966571807861, "lr": 0.0002, "elapsed_sec": 12163.621833562851, "step_time_sec": 8.22985282400623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1471, "loss": 5.251651763916016, "lr": 0.0002, "elapsed_sec": 12171.852603912354, "step_time_sec": 8.230598497000756, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1472, "loss": 5.3992204666137695, "lr": 0.0002, "elapsed_sec": 12180.081127166748, "step_time_sec": 8.22844593701302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1473, "loss": 5.279092788696289, "lr": 0.0002, "elapsed_sec": 12188.310360908508, "step_time_sec": 8.22898652398726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1474, "loss": 5.3234357833862305, "lr": 0.0002, "elapsed_sec": 12196.53752708435, "step_time_sec": 8.227016487013316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1475, "loss": 5.26941442489624, "lr": 0.0002, "elapsed_sec": 12204.76582121849, "step_time_sec": 8.228123090986628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1476, "loss": 5.3271803855896, "lr": 0.0002, "elapsed_sec": 12212.994223356247, "step_time_sec": 8.228233917005127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1477, "loss": 5.265645980834961, "lr": 0.0002, "elapsed_sec": 12221.223416566849, "step_time_sec": 8.22903365897946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1478, "loss": 5.221604347229004, "lr": 0.0002, "elapsed_sec": 12229.45373415947, "step_time_sec": 8.230163525004173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1479, "loss": 5.352782249450684, "lr": 0.0002, "elapsed_sec": 12237.684047698975, "step_time_sec": 8.230144052999094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1480, "loss": 5.313464164733887, "lr": 0.0002, "elapsed_sec": 12245.914512872696, "step_time_sec": 8.230264921992784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1481, "loss": 5.388936996459961, "lr": 0.0002, "elapsed_sec": 12254.144374370575, "step_time_sec": 8.22969302997808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1482, "loss": 5.3255228996276855, "lr": 0.0002, "elapsed_sec": 12262.37274646759, "step_time_sec": 8.228216929011978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1483, "loss": 5.330015182495117, "lr": 0.0002, "elapsed_sec": 12270.600438594818, "step_time_sec": 8.227565681008855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1484, "loss": 5.370212554931641, "lr": 0.0002, "elapsed_sec": 12278.83129644394, "step_time_sec": 8.230660824978258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1485, "loss": 5.309347629547119, "lr": 0.0002, "elapsed_sec": 12287.061402320862, "step_time_sec": 8.229942808975466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1486, "loss": 5.194454193115234, "lr": 0.0002, "elapsed_sec": 12295.2904047966, "step_time_sec": 8.228890926984604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1487, "loss": 5.341149806976318, "lr": 0.0002, "elapsed_sec": 12303.519338846207, "step_time_sec": 8.228755461983383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1488, "loss": 5.266222953796387, "lr": 0.0002, "elapsed_sec": 12311.746935844421, "step_time_sec": 8.227475681022042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1489, "loss": 5.346837997436523, "lr": 0.0002, "elapsed_sec": 12319.976143598557, "step_time_sec": 8.229039158992236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1490, "loss": 5.388810634613037, "lr": 0.0002, "elapsed_sec": 12328.20484161377, "step_time_sec": 8.22845442997641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1491, "loss": 5.244274616241455, "lr": 0.0002, "elapsed_sec": 12336.433992385864, "step_time_sec": 8.22905071801506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1492, "loss": 5.232855796813965, "lr": 0.0002, "elapsed_sec": 12344.66277885437, "step_time_sec": 8.228598462010268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1493, "loss": 5.260524749755859, "lr": 0.0002, "elapsed_sec": 12352.89084482193, "step_time_sec": 8.22788077100995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1494, "loss": 5.367447853088379, "lr": 0.0002, "elapsed_sec": 12361.121587753296, "step_time_sec": 8.230576963018393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1495, "loss": 5.429032325744629, "lr": 0.0002, "elapsed_sec": 12369.351596355438, "step_time_sec": 8.229874428012408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1496, "loss": 5.273684978485107, "lr": 0.0002, "elapsed_sec": 12377.579930067062, "step_time_sec": 8.228187767002964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1497, "loss": 5.251448631286621, "lr": 0.0002, "elapsed_sec": 12385.808651208878, "step_time_sec": 8.228492224996444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1498, "loss": 5.248360633850098, "lr": 0.0002, "elapsed_sec": 12394.038583278656, "step_time_sec": 8.229845377994934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1499, "loss": 5.376967906951904, "lr": 0.0002, "elapsed_sec": 12402.268560647964, "step_time_sec": 8.229767379991245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1500, "loss": 5.311886787414551, "lr": 0.0002, "elapsed_sec": 12410.49930882454, "step_time_sec": 29.62452654799563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1501, "loss": 5.364065647125244, "lr": 0.0002, "elapsed_sec": 12440.13678431511, "step_time_sec": 8.243102259992156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1502, "loss": 5.228957653045654, "lr": 0.0002, "elapsed_sec": 12448.353034496307, "step_time_sec": 8.21602600600454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1503, "loss": 5.303123474121094, "lr": 0.0002, "elapsed_sec": 12456.568934679031, "step_time_sec": 8.215785696986131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1504, "loss": 5.315816402435303, "lr": 0.0002, "elapsed_sec": 12464.784781455994, "step_time_sec": 8.215660228015622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1505, "loss": 5.324832916259766, "lr": 0.0002, "elapsed_sec": 12473.011330366135, "step_time_sec": 8.226381943997694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1506, "loss": 5.2483439445495605, "lr": 0.0002, "elapsed_sec": 12481.240431308746, "step_time_sec": 8.228922087000683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1507, "loss": 5.298169136047363, "lr": 0.0002, "elapsed_sec": 12489.470514059067, "step_time_sec": 8.229949023982044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1508, "loss": 5.2817535400390625, "lr": 0.0002, "elapsed_sec": 12497.700978517532, "step_time_sec": 8.230316575994948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1509, "loss": 5.2709574699401855, "lr": 0.0002, "elapsed_sec": 12505.93099617958, "step_time_sec": 8.229811333992984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1510, "loss": 5.412075042724609, "lr": 0.0002, "elapsed_sec": 12514.158694028854, "step_time_sec": 8.227587802015478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1511, "loss": 5.340758323669434, "lr": 0.0002, "elapsed_sec": 12522.387260437012, "step_time_sec": 8.228349065990187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1512, "loss": 4.388454437255859, "lr": 0.0002, "elapsed_sec": 12530.617022752762, "step_time_sec": 8.22967342400807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1513, "loss": 5.344824314117432, "lr": 0.0002, "elapsed_sec": 12538.847321748734, "step_time_sec": 8.23008052201476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1514, "loss": 5.304281711578369, "lr": 0.0002, "elapsed_sec": 12547.077985525131, "step_time_sec": 8.230515887989895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1515, "loss": 5.27113676071167, "lr": 0.0002, "elapsed_sec": 12555.307999134064, "step_time_sec": 8.229927282984136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1516, "loss": 5.3063063621521, "lr": 0.0002, "elapsed_sec": 12563.538551330566, "step_time_sec": 8.23033602599753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1517, "loss": 5.355304718017578, "lr": 0.0002, "elapsed_sec": 12571.767528057098, "step_time_sec": 8.228793625021353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1518, "loss": 5.306970119476318, "lr": 0.0002, "elapsed_sec": 12579.998583316803, "step_time_sec": 8.230905111005995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1519, "loss": 5.366566181182861, "lr": 0.0002, "elapsed_sec": 12588.230539560318, "step_time_sec": 8.231845919013722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1520, "loss": 5.288058757781982, "lr": 0.0002, "elapsed_sec": 12596.45920419693, "step_time_sec": 8.228474970004754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1521, "loss": 5.303884029388428, "lr": 0.0002, "elapsed_sec": 12604.688995838165, "step_time_sec": 8.229632948001381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1522, "loss": 5.3853254318237305, "lr": 0.0002, "elapsed_sec": 12612.919079780579, "step_time_sec": 8.229944928025361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1523, "loss": 5.243937015533447, "lr": 0.0002, "elapsed_sec": 12621.150232076645, "step_time_sec": 8.230944940005429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1524, "loss": 5.160130023956299, "lr": 0.0002, "elapsed_sec": 12629.379280090332, "step_time_sec": 8.228947135008639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1525, "loss": 5.291526794433594, "lr": 0.0002, "elapsed_sec": 12637.609676361084, "step_time_sec": 8.230233907001093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1526, "loss": 5.33349609375, "lr": 0.0002, "elapsed_sec": 12645.839708566666, "step_time_sec": 8.229911421018187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1527, "loss": 5.36237096786499, "lr": 0.0002, "elapsed_sec": 12654.07052397728, "step_time_sec": 8.230606143013574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1528, "loss": 5.213667869567871, "lr": 0.0002, "elapsed_sec": 12662.300608634949, "step_time_sec": 8.229924298997503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1529, "loss": 5.34772253036499, "lr": 0.0002, "elapsed_sec": 12670.530014276505, "step_time_sec": 8.22925466898596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1530, "loss": 5.3274712562561035, "lr": 0.0002, "elapsed_sec": 12678.758257389069, "step_time_sec": 8.228103214001749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1531, "loss": 5.194119930267334, "lr": 0.0002, "elapsed_sec": 12686.987451076508, "step_time_sec": 8.229034418007359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1532, "loss": 5.198384761810303, "lr": 0.0002, "elapsed_sec": 12695.214823246002, "step_time_sec": 8.22721066500526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1533, "loss": 5.316221714019775, "lr": 0.0002, "elapsed_sec": 12703.445101976395, "step_time_sec": 8.230130133975763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1534, "loss": 5.287467956542969, "lr": 0.0002, "elapsed_sec": 12711.675613880157, "step_time_sec": 8.23042684799293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1535, "loss": 5.382444858551025, "lr": 0.0002, "elapsed_sec": 12719.903988361359, "step_time_sec": 8.22815676900791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1536, "loss": 5.30141019821167, "lr": 0.0002, "elapsed_sec": 12728.131527900696, "step_time_sec": 8.22741397601203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1537, "loss": 5.218835353851318, "lr": 0.0002, "elapsed_sec": 12736.359355926514, "step_time_sec": 8.227627014974132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1538, "loss": 5.3224310874938965, "lr": 0.0002, "elapsed_sec": 12744.589490890503, "step_time_sec": 8.229960916010896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1539, "loss": 5.360857009887695, "lr": 0.0002, "elapsed_sec": 12752.818230390549, "step_time_sec": 8.228635973006021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1540, "loss": 5.249100685119629, "lr": 0.0002, "elapsed_sec": 12761.048766613007, "step_time_sec": 8.230336988985073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1541, "loss": 5.219057083129883, "lr": 0.0002, "elapsed_sec": 12769.280107498169, "step_time_sec": 8.231156994996127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1542, "loss": 5.289678573608398, "lr": 0.0002, "elapsed_sec": 12777.509943962097, "step_time_sec": 8.2296671049844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1543, "loss": 5.314565181732178, "lr": 0.0002, "elapsed_sec": 12785.739959955215, "step_time_sec": 8.229926885978784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1544, "loss": 5.3853759765625, "lr": 0.0002, "elapsed_sec": 12793.970191955566, "step_time_sec": 8.23008227997343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1545, "loss": 5.315835475921631, "lr": 0.0002, "elapsed_sec": 12802.19937968254, "step_time_sec": 8.22895741200773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1546, "loss": 5.1934919357299805, "lr": 0.0002, "elapsed_sec": 12810.428758144379, "step_time_sec": 8.229211748985108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1547, "loss": 5.354621887207031, "lr": 0.0002, "elapsed_sec": 12818.655815124512, "step_time_sec": 8.226895130996127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1548, "loss": 5.12799072265625, "lr": 0.0002, "elapsed_sec": 12826.884487390518, "step_time_sec": 8.228551403008169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1549, "loss": 5.218661308288574, "lr": 0.0002, "elapsed_sec": 12835.114298343658, "step_time_sec": 8.229617072996916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1550, "loss": 5.23795747756958, "lr": 0.0002, "elapsed_sec": 12843.341487169266, "step_time_sec": 8.227010597009212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1551, "loss": 5.24676513671875, "lr": 0.0002, "elapsed_sec": 12851.56960940361, "step_time_sec": 8.227947190986015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1552, "loss": 5.320764064788818, "lr": 0.0002, "elapsed_sec": 12859.79736328125, "step_time_sec": 8.227629272994818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1553, "loss": 5.186319828033447, "lr": 0.0002, "elapsed_sec": 12868.027632713318, "step_time_sec": 8.230144337023376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1554, "loss": 5.280018329620361, "lr": 0.0002, "elapsed_sec": 12876.257194757462, "step_time_sec": 8.22943638000288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1555, "loss": 5.326862812042236, "lr": 0.0002, "elapsed_sec": 12884.486622810364, "step_time_sec": 8.229213667014847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1556, "loss": 5.327235698699951, "lr": 0.0002, "elapsed_sec": 12892.717481851578, "step_time_sec": 8.230688468000153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1557, "loss": 5.186554908752441, "lr": 0.0002, "elapsed_sec": 12900.94577741623, "step_time_sec": 8.228197454998735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1558, "loss": 5.305722236633301, "lr": 0.0002, "elapsed_sec": 12909.174088478088, "step_time_sec": 8.22810998398927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1559, "loss": 5.335611820220947, "lr": 0.0002, "elapsed_sec": 12917.404146671295, "step_time_sec": 8.229902179999044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1560, "loss": 5.268307209014893, "lr": 0.0002, "elapsed_sec": 12925.634032011032, "step_time_sec": 8.229703646997223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1561, "loss": 5.200066089630127, "lr": 0.0002, "elapsed_sec": 12933.864035367966, "step_time_sec": 8.229871340008685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1562, "loss": 5.188622951507568, "lr": 0.0002, "elapsed_sec": 12942.094165563583, "step_time_sec": 8.229950109001948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1563, "loss": 5.250656604766846, "lr": 0.0002, "elapsed_sec": 12950.323578834534, "step_time_sec": 8.22923872602405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1564, "loss": 5.386078357696533, "lr": 0.0002, "elapsed_sec": 12958.553001403809, "step_time_sec": 8.229325465013972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1565, "loss": 5.425146102905273, "lr": 0.0002, "elapsed_sec": 12966.782608747482, "step_time_sec": 8.229443108022679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1566, "loss": 5.300329685211182, "lr": 0.0002, "elapsed_sec": 12975.01342868805, "step_time_sec": 8.230661689012777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1567, "loss": 5.216520309448242, "lr": 0.0002, "elapsed_sec": 12983.241999149323, "step_time_sec": 8.22837426199112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1568, "loss": 5.218174934387207, "lr": 0.0002, "elapsed_sec": 12991.470460176468, "step_time_sec": 8.228252817993052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1569, "loss": 5.341640472412109, "lr": 0.0002, "elapsed_sec": 12999.699682235718, "step_time_sec": 8.229037718992913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1570, "loss": 5.269341945648193, "lr": 0.0002, "elapsed_sec": 13007.929421424866, "step_time_sec": 8.229604526015464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1571, "loss": 5.264288425445557, "lr": 0.0002, "elapsed_sec": 13016.159247636795, "step_time_sec": 8.229696738009807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1572, "loss": 5.357127666473389, "lr": 0.0002, "elapsed_sec": 13024.389632225037, "step_time_sec": 8.230185787018854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1573, "loss": 5.152184963226318, "lr": 0.0002, "elapsed_sec": 13032.61989235878, "step_time_sec": 8.230091372999595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1574, "loss": 5.277871608734131, "lr": 0.0002, "elapsed_sec": 13040.849715709686, "step_time_sec": 8.229670110013103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1575, "loss": 5.224621295928955, "lr": 0.0002, "elapsed_sec": 13049.075815439224, "step_time_sec": 8.225939557014499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1576, "loss": 5.289681911468506, "lr": 0.0002, "elapsed_sec": 13057.304163455963, "step_time_sec": 8.228203745005885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1577, "loss": 5.24407434463501, "lr": 0.0002, "elapsed_sec": 13065.531267881393, "step_time_sec": 8.226966042013373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1578, "loss": 5.208822727203369, "lr": 0.0002, "elapsed_sec": 13073.762075901031, "step_time_sec": 8.230614936008351, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1579, "loss": 5.249774932861328, "lr": 0.0002, "elapsed_sec": 13081.99303483963, "step_time_sec": 8.230830423999578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1580, "loss": 5.406113147735596, "lr": 0.0002, "elapsed_sec": 13090.222854614258, "step_time_sec": 8.229640150006162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1581, "loss": 5.240412712097168, "lr": 0.0002, "elapsed_sec": 13098.453923463821, "step_time_sec": 8.230945250979858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1582, "loss": 5.1985650062561035, "lr": 0.0002, "elapsed_sec": 13106.683927297592, "step_time_sec": 8.229822261986556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1583, "loss": 5.302002429962158, "lr": 0.0002, "elapsed_sec": 13114.91464805603, "step_time_sec": 8.230515967996325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1584, "loss": 5.159221172332764, "lr": 0.0002, "elapsed_sec": 13123.144393205643, "step_time_sec": 8.229639971017605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1585, "loss": 5.172482967376709, "lr": 0.0002, "elapsed_sec": 13131.372996330261, "step_time_sec": 8.22844521101797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1586, "loss": 5.3609724044799805, "lr": 0.0002, "elapsed_sec": 13139.601239442825, "step_time_sec": 8.228064188006101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1587, "loss": 5.246559143066406, "lr": 0.0002, "elapsed_sec": 13147.828528881073, "step_time_sec": 8.227100282005267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1588, "loss": 5.397809028625488, "lr": 0.0002, "elapsed_sec": 13156.05736899376, "step_time_sec": 8.228699389001122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1589, "loss": 5.243554592132568, "lr": 0.0002, "elapsed_sec": 13164.286157369614, "step_time_sec": 8.22863622001023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1590, "loss": 5.291842460632324, "lr": 0.0002, "elapsed_sec": 13172.516016721725, "step_time_sec": 8.229702751006698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1591, "loss": 5.217782974243164, "lr": 0.0002, "elapsed_sec": 13180.74635052681, "step_time_sec": 8.23015152500011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1592, "loss": 5.141608238220215, "lr": 0.0002, "elapsed_sec": 13188.976783752441, "step_time_sec": 8.23028835200239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1593, "loss": 5.267754077911377, "lr": 0.0002, "elapsed_sec": 13197.207120418549, "step_time_sec": 8.230151288997149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1594, "loss": 5.144164085388184, "lr": 0.0002, "elapsed_sec": 13205.438074827194, "step_time_sec": 8.230810242006555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1595, "loss": 5.385934352874756, "lr": 0.0002, "elapsed_sec": 13213.668692827225, "step_time_sec": 8.230505170999095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1596, "loss": 5.196333885192871, "lr": 0.0002, "elapsed_sec": 13221.898766994476, "step_time_sec": 8.229826404014602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1597, "loss": 5.34324836730957, "lr": 0.0002, "elapsed_sec": 13230.129623889923, "step_time_sec": 8.230714044009801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1598, "loss": 5.1978960037231445, "lr": 0.0002, "elapsed_sec": 13238.35851597786, "step_time_sec": 8.228758134995587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1599, "loss": 5.284778594970703, "lr": 0.0002, "elapsed_sec": 13246.587373971939, "step_time_sec": 8.228722817002563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1600, "loss": 5.375219821929932, "lr": 0.0002, "elapsed_sec": 13254.816084384918, "step_time_sec": 8.228481372992974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1601, "loss": 5.17708158493042, "lr": 0.0002, "elapsed_sec": 13263.044111967087, "step_time_sec": 8.227864912012592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1602, "loss": 5.239114284515381, "lr": 0.0002, "elapsed_sec": 13271.270354509354, "step_time_sec": 8.226066560979234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1603, "loss": 5.252806186676025, "lr": 0.0002, "elapsed_sec": 13279.500763654709, "step_time_sec": 8.230262412020238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1604, "loss": 5.3333845138549805, "lr": 0.0002, "elapsed_sec": 13287.731304645538, "step_time_sec": 8.23041257497971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1605, "loss": 5.27605676651001, "lr": 0.0002, "elapsed_sec": 13295.959390163422, "step_time_sec": 8.227896689000772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1606, "loss": 5.264378547668457, "lr": 0.0002, "elapsed_sec": 13304.1884329319, "step_time_sec": 8.228889538993826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1607, "loss": 5.0827178955078125, "lr": 0.0002, "elapsed_sec": 13312.415606498718, "step_time_sec": 8.227068945998326, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1608, "loss": 5.160030841827393, "lr": 0.0002, "elapsed_sec": 13320.646649599075, "step_time_sec": 8.23081758999615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1609, "loss": 5.103649616241455, "lr": 0.0002, "elapsed_sec": 13328.877407550812, "step_time_sec": 8.23058381199371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1610, "loss": 5.211599349975586, "lr": 0.0002, "elapsed_sec": 13337.10547876358, "step_time_sec": 8.227963374985848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1611, "loss": 5.240067005157471, "lr": 0.0002, "elapsed_sec": 13345.333974123001, "step_time_sec": 8.228308541001752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1612, "loss": 5.195704460144043, "lr": 0.0002, "elapsed_sec": 13353.562461853027, "step_time_sec": 8.228347472992027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1613, "loss": 5.293488025665283, "lr": 0.0002, "elapsed_sec": 13361.790843248367, "step_time_sec": 8.228244048979832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1614, "loss": 5.185688018798828, "lr": 0.0002, "elapsed_sec": 13370.020740509033, "step_time_sec": 8.229702028998872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1615, "loss": 5.163677215576172, "lr": 0.0002, "elapsed_sec": 13378.251133680344, "step_time_sec": 8.230311174993403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1616, "loss": 5.276895046234131, "lr": 0.0002, "elapsed_sec": 13386.481538534164, "step_time_sec": 8.230233870010125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1617, "loss": 5.359260082244873, "lr": 0.0002, "elapsed_sec": 13394.71020269394, "step_time_sec": 8.228461502003483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1618, "loss": 5.327740669250488, "lr": 0.0002, "elapsed_sec": 13402.94209265709, "step_time_sec": 8.23179817100754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1619, "loss": 5.318942546844482, "lr": 0.0002, "elapsed_sec": 13411.172397613525, "step_time_sec": 8.230080798006384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1620, "loss": 5.17771053314209, "lr": 0.0002, "elapsed_sec": 13419.402772426605, "step_time_sec": 8.230255286995089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1621, "loss": 5.238345623016357, "lr": 0.0002, "elapsed_sec": 13427.633129835129, "step_time_sec": 8.230187982000643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1622, "loss": 5.212898254394531, "lr": 0.0002, "elapsed_sec": 13435.863816022873, "step_time_sec": 8.230519853997976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1623, "loss": 5.200588703155518, "lr": 0.0002, "elapsed_sec": 13444.093792200089, "step_time_sec": 8.229836772021372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1624, "loss": 5.362468242645264, "lr": 0.0002, "elapsed_sec": 13452.32249712944, "step_time_sec": 8.228521293000085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1625, "loss": 5.151779651641846, "lr": 0.0002, "elapsed_sec": 13461.576930761337, "step_time_sec": 9.254333529999712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1626, "loss": 5.237819194793701, "lr": 0.0002, "elapsed_sec": 13469.805528640747, "step_time_sec": 8.228402943990659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1627, "loss": 5.253832817077637, "lr": 0.0002, "elapsed_sec": 13478.035462856293, "step_time_sec": 8.229779915011022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1628, "loss": 5.263566970825195, "lr": 0.0002, "elapsed_sec": 13486.266534090042, "step_time_sec": 8.230912518018158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1629, "loss": 5.227478504180908, "lr": 0.0002, "elapsed_sec": 13494.497614860535, "step_time_sec": 8.230930908001028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1630, "loss": 5.13606595993042, "lr": 0.0002, "elapsed_sec": 13502.727804660797, "step_time_sec": 8.230043490999378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1631, "loss": 5.209609508514404, "lr": 0.0002, "elapsed_sec": 13510.958590269089, "step_time_sec": 8.23062531001051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1632, "loss": 5.203192234039307, "lr": 0.0002, "elapsed_sec": 13519.189666748047, "step_time_sec": 8.230920409987448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1633, "loss": 5.176675319671631, "lr": 0.0002, "elapsed_sec": 13527.419996976852, "step_time_sec": 8.230209343018942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1634, "loss": 5.10483455657959, "lr": 0.0002, "elapsed_sec": 13535.651068925858, "step_time_sec": 8.230845276993932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1635, "loss": 5.240583896636963, "lr": 0.0002, "elapsed_sec": 13543.879708051682, "step_time_sec": 8.228484530991409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1636, "loss": 5.160721778869629, "lr": 0.0002, "elapsed_sec": 13552.108605623245, "step_time_sec": 8.228750318987295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1637, "loss": 5.242792129516602, "lr": 0.0002, "elapsed_sec": 13560.33663225174, "step_time_sec": 8.227870094007812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1638, "loss": 5.236257553100586, "lr": 0.0002, "elapsed_sec": 13568.56554555893, "step_time_sec": 8.228791067987913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1639, "loss": 5.056433200836182, "lr": 0.0002, "elapsed_sec": 13576.793441534042, "step_time_sec": 8.227772290003486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1640, "loss": 5.35129451751709, "lr": 0.0002, "elapsed_sec": 13585.024570465088, "step_time_sec": 8.230916676984634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1641, "loss": 5.216903209686279, "lr": 0.0002, "elapsed_sec": 13593.255442619324, "step_time_sec": 8.230716678022873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1642, "loss": 5.311046123504639, "lr": 0.0002, "elapsed_sec": 13601.484746217728, "step_time_sec": 8.229146373982076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1643, "loss": 5.0174174308776855, "lr": 0.0002, "elapsed_sec": 13609.716252565384, "step_time_sec": 8.231366395018995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1644, "loss": 5.117071151733398, "lr": 0.0002, "elapsed_sec": 13617.946625471115, "step_time_sec": 8.23025387301459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1645, "loss": 5.300999641418457, "lr": 0.0002, "elapsed_sec": 13626.174428224564, "step_time_sec": 8.227592339011608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1646, "loss": 5.314621448516846, "lr": 0.0002, "elapsed_sec": 13634.403045654297, "step_time_sec": 8.228462575003505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1647, "loss": 5.077280044555664, "lr": 0.0002, "elapsed_sec": 13642.632058382034, "step_time_sec": 8.228864283009898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1648, "loss": 5.260457515716553, "lr": 0.0002, "elapsed_sec": 13650.860299348831, "step_time_sec": 8.228086883987999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1649, "loss": 5.213033676147461, "lr": 0.0002, "elapsed_sec": 13659.087574481964, "step_time_sec": 8.227109133003978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1650, "loss": 5.139934539794922, "lr": 0.0002, "elapsed_sec": 13667.316041707993, "step_time_sec": 8.228306599019561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1651, "loss": 5.2101287841796875, "lr": 0.0002, "elapsed_sec": 13675.545790195465, "step_time_sec": 8.229588487010915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1652, "loss": 5.149075031280518, "lr": 0.0002, "elapsed_sec": 13683.77617764473, "step_time_sec": 8.230283821001649, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1653, "loss": 5.277585029602051, "lr": 0.0002, "elapsed_sec": 13692.006642580032, "step_time_sec": 8.230320490984013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1654, "loss": 5.135995864868164, "lr": 0.0002, "elapsed_sec": 13700.23759675026, "step_time_sec": 8.230809882981703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1655, "loss": 5.385804176330566, "lr": 0.0002, "elapsed_sec": 13708.46706867218, "step_time_sec": 8.229246236005565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1656, "loss": 5.216671466827393, "lr": 0.0002, "elapsed_sec": 13716.69418668747, "step_time_sec": 8.226970260002417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1657, "loss": 5.323742866516113, "lr": 0.0002, "elapsed_sec": 13724.924957990646, "step_time_sec": 8.230575494992081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1658, "loss": 5.266468048095703, "lr": 0.0002, "elapsed_sec": 13733.15569114685, "step_time_sec": 8.230624445015565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1659, "loss": 5.143752098083496, "lr": 0.0002, "elapsed_sec": 13741.386399269104, "step_time_sec": 8.230513871996664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1660, "loss": 5.280832290649414, "lr": 0.0002, "elapsed_sec": 13749.617087125778, "step_time_sec": 8.230547836021287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1661, "loss": 5.177154064178467, "lr": 0.0002, "elapsed_sec": 13757.846961975098, "step_time_sec": 8.229700909985695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1662, "loss": 5.198588848114014, "lr": 0.0002, "elapsed_sec": 13766.077773571014, "step_time_sec": 8.230665087001398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1663, "loss": 5.188675403594971, "lr": 0.0002, "elapsed_sec": 13774.308475494385, "step_time_sec": 8.230581506009912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1664, "loss": 5.090674877166748, "lr": 0.0002, "elapsed_sec": 13782.538082838058, "step_time_sec": 8.229419629002223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1665, "loss": 5.149723529815674, "lr": 0.0002, "elapsed_sec": 13790.768454790115, "step_time_sec": 8.230193531984696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1666, "loss": 5.163173675537109, "lr": 0.0002, "elapsed_sec": 13798.99864411354, "step_time_sec": 8.230080798995914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1667, "loss": 5.200920104980469, "lr": 0.0002, "elapsed_sec": 13807.227938890457, "step_time_sec": 8.22913135800627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1668, "loss": 5.144057750701904, "lr": 0.0002, "elapsed_sec": 13815.457010030746, "step_time_sec": 8.228918046021136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1669, "loss": 5.166540622711182, "lr": 0.0002, "elapsed_sec": 13823.683610916138, "step_time_sec": 8.226445534004597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1670, "loss": 5.141271114349365, "lr": 0.0002, "elapsed_sec": 13831.910953521729, "step_time_sec": 8.22711852498469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1671, "loss": 5.14318323135376, "lr": 0.0002, "elapsed_sec": 13840.140064954758, "step_time_sec": 8.228970593016129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1672, "loss": 5.179304599761963, "lr": 0.0002, "elapsed_sec": 13848.370200157166, "step_time_sec": 8.23000919597689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1673, "loss": 5.186712265014648, "lr": 0.0002, "elapsed_sec": 13856.599059820175, "step_time_sec": 8.228614759020275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1674, "loss": 5.133162498474121, "lr": 0.0002, "elapsed_sec": 13864.826286792755, "step_time_sec": 8.227078668016475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1675, "loss": 5.2578935623168945, "lr": 0.0002, "elapsed_sec": 13873.05661869049, "step_time_sec": 8.230184683023253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1676, "loss": 5.102133750915527, "lr": 0.0002, "elapsed_sec": 13881.285828113556, "step_time_sec": 8.229050785012078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1677, "loss": 5.250524044036865, "lr": 0.0002, "elapsed_sec": 13889.514171600342, "step_time_sec": 8.228199973993469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1678, "loss": 5.255073070526123, "lr": 0.0002, "elapsed_sec": 13897.742752790451, "step_time_sec": 8.22839899599785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1679, "loss": 5.1358208656311035, "lr": 0.0002, "elapsed_sec": 13905.971957921982, "step_time_sec": 8.229043867002474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1680, "loss": 5.213582515716553, "lr": 0.0002, "elapsed_sec": 13914.204421281815, "step_time_sec": 8.232377110020025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1681, "loss": 5.196303367614746, "lr": 0.0002, "elapsed_sec": 13922.434030532837, "step_time_sec": 8.229424568009563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1682, "loss": 5.127326965332031, "lr": 0.0002, "elapsed_sec": 13930.665354013443, "step_time_sec": 8.231080267985817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1683, "loss": 5.035671710968018, "lr": 0.0002, "elapsed_sec": 13938.894193410873, "step_time_sec": 8.228750565001974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1684, "loss": 5.224015235900879, "lr": 0.0002, "elapsed_sec": 13947.124392986298, "step_time_sec": 8.230045873002382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1685, "loss": 5.220308780670166, "lr": 0.0002, "elapsed_sec": 13955.35413813591, "step_time_sec": 8.229565165995155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1686, "loss": 5.210274696350098, "lr": 0.0002, "elapsed_sec": 13963.584640979767, "step_time_sec": 8.230318271991564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1687, "loss": 5.155210971832275, "lr": 0.0002, "elapsed_sec": 13971.813673257828, "step_time_sec": 8.228885062999325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1688, "loss": 5.169779300689697, "lr": 0.0002, "elapsed_sec": 13980.043025255203, "step_time_sec": 8.229193900973769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1689, "loss": 5.220111846923828, "lr": 0.0002, "elapsed_sec": 13988.272386789322, "step_time_sec": 8.22921942800167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1690, "loss": 5.314274311065674, "lr": 0.0002, "elapsed_sec": 13996.502832889557, "step_time_sec": 8.23030185198877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1691, "loss": 5.114179611206055, "lr": 0.0002, "elapsed_sec": 14004.732589960098, "step_time_sec": 8.229568721988471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1692, "loss": 5.198371410369873, "lr": 0.0002, "elapsed_sec": 14012.96177649498, "step_time_sec": 8.229050177003955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1693, "loss": 5.0602312088012695, "lr": 0.0002, "elapsed_sec": 14021.192053318024, "step_time_sec": 8.230126798007404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1694, "loss": 5.12473726272583, "lr": 0.0002, "elapsed_sec": 14029.422091722488, "step_time_sec": 8.229931534006028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1695, "loss": 5.278766632080078, "lr": 0.0002, "elapsed_sec": 14037.651931285858, "step_time_sec": 8.229679173004115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1696, "loss": 5.133466720581055, "lr": 0.0002, "elapsed_sec": 14045.87979722023, "step_time_sec": 8.22764246101724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1697, "loss": 5.393361568450928, "lr": 0.0002, "elapsed_sec": 14054.10882306099, "step_time_sec": 8.22893206399749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1698, "loss": 5.217679023742676, "lr": 0.0002, "elapsed_sec": 14062.337748527527, "step_time_sec": 8.22875790699618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1699, "loss": 5.067663669586182, "lr": 0.0002, "elapsed_sec": 14070.567241191864, "step_time_sec": 8.229316921991995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1700, "loss": 5.07785177230835, "lr": 0.0002, "elapsed_sec": 14078.798162937164, "step_time_sec": 8.230819218995748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1701, "loss": 5.105025768280029, "lr": 0.0002, "elapsed_sec": 14087.02806854248, "step_time_sec": 8.229682887991657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1702, "loss": 5.144552707672119, "lr": 0.0002, "elapsed_sec": 14095.257333278656, "step_time_sec": 8.229105348989833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1703, "loss": 5.121956825256348, "lr": 0.0002, "elapsed_sec": 14103.485352039337, "step_time_sec": 8.227821370994207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1704, "loss": 5.3215765953063965, "lr": 0.0002, "elapsed_sec": 14111.714953422546, "step_time_sec": 8.22947369798203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1705, "loss": 5.156256198883057, "lr": 0.0002, "elapsed_sec": 14119.943102359772, "step_time_sec": 8.227987893973477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1706, "loss": 5.127401828765869, "lr": 0.0002, "elapsed_sec": 14128.172138690948, "step_time_sec": 8.228872359992238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1707, "loss": 5.134334564208984, "lr": 0.0002, "elapsed_sec": 14136.399785041809, "step_time_sec": 8.227565512002911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1708, "loss": 5.271867275238037, "lr": 0.0002, "elapsed_sec": 14144.628656864166, "step_time_sec": 8.228643654001644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1709, "loss": 5.279910564422607, "lr": 0.0002, "elapsed_sec": 14152.8580596447, "step_time_sec": 8.22924991499167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1710, "loss": 5.176009178161621, "lr": 0.0002, "elapsed_sec": 14161.088431358337, "step_time_sec": 8.230211123998743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1711, "loss": 5.274176120758057, "lr": 0.0002, "elapsed_sec": 14169.319189071655, "step_time_sec": 8.230652836995432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1712, "loss": 5.309977054595947, "lr": 0.0002, "elapsed_sec": 14177.548374652863, "step_time_sec": 8.229008560010698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1713, "loss": 5.2450852394104, "lr": 0.0002, "elapsed_sec": 14185.777730464935, "step_time_sec": 8.2291570600064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1714, "loss": 5.122819900512695, "lr": 0.0002, "elapsed_sec": 14194.005130290985, "step_time_sec": 8.227312084985897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1715, "loss": 5.1728315353393555, "lr": 0.0002, "elapsed_sec": 14202.23614025116, "step_time_sec": 8.230799940007273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1716, "loss": 5.185508728027344, "lr": 0.0002, "elapsed_sec": 14210.466784954071, "step_time_sec": 8.230497994984034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1717, "loss": 5.0916032791137695, "lr": 0.0002, "elapsed_sec": 14218.69655919075, "step_time_sec": 8.229623888008064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1718, "loss": 5.113310813903809, "lr": 0.0002, "elapsed_sec": 14226.926642417908, "step_time_sec": 8.229936310992343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1719, "loss": 5.081583499908447, "lr": 0.0002, "elapsed_sec": 14235.157288312912, "step_time_sec": 8.230527679988882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1720, "loss": 5.098166465759277, "lr": 0.0002, "elapsed_sec": 14243.387971162796, "step_time_sec": 8.230447395995725, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1721, "loss": 5.083621501922607, "lr": 0.0002, "elapsed_sec": 14251.61781668663, "step_time_sec": 8.229681075987173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1722, "loss": 5.074539661407471, "lr": 0.0002, "elapsed_sec": 14259.849143743515, "step_time_sec": 8.23115808799048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1723, "loss": 5.179816722869873, "lr": 0.0002, "elapsed_sec": 14268.077831029892, "step_time_sec": 8.228571599989664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1724, "loss": 5.1675920486450195, "lr": 0.0002, "elapsed_sec": 14276.30753827095, "step_time_sec": 8.229507550015114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1725, "loss": 5.0210676193237305, "lr": 0.0002, "elapsed_sec": 14284.535483837128, "step_time_sec": 8.227789461001521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1726, "loss": 5.194875717163086, "lr": 0.0002, "elapsed_sec": 14292.762493371964, "step_time_sec": 8.22690593500738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1727, "loss": 5.07668399810791, "lr": 0.0002, "elapsed_sec": 14300.993287563324, "step_time_sec": 8.230589026003145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1728, "loss": 5.125586986541748, "lr": 0.0002, "elapsed_sec": 14309.22351717949, "step_time_sec": 8.230152947013266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1729, "loss": 5.171890735626221, "lr": 0.0002, "elapsed_sec": 14317.454419612885, "step_time_sec": 8.230692620010814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1730, "loss": 5.148971080780029, "lr": 0.0002, "elapsed_sec": 14325.68505358696, "step_time_sec": 8.230473084986443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1731, "loss": 5.2284135818481445, "lr": 0.0002, "elapsed_sec": 14333.916429758072, "step_time_sec": 8.23123958002543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1732, "loss": 5.271515846252441, "lr": 0.0002, "elapsed_sec": 14342.147816896439, "step_time_sec": 8.231160097988322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1733, "loss": 5.168789386749268, "lr": 0.0002, "elapsed_sec": 14350.376073122025, "step_time_sec": 8.228117163991556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1734, "loss": 5.2991838455200195, "lr": 0.0002, "elapsed_sec": 14358.604858398438, "step_time_sec": 8.228623688977677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1735, "loss": 5.165524482727051, "lr": 0.0002, "elapsed_sec": 14366.835417032242, "step_time_sec": 8.230419683997752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1736, "loss": 5.0642571449279785, "lr": 0.0002, "elapsed_sec": 14375.06555390358, "step_time_sec": 8.22996100000455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1737, "loss": 5.015491485595703, "lr": 0.0002, "elapsed_sec": 14383.295592069626, "step_time_sec": 8.229895803000545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1738, "loss": 5.094751834869385, "lr": 0.0002, "elapsed_sec": 14391.526136398315, "step_time_sec": 8.230406392016448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1739, "loss": 5.053066730499268, "lr": 0.0002, "elapsed_sec": 14399.755081415176, "step_time_sec": 8.228821530006826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1740, "loss": 4.990212917327881, "lr": 0.0002, "elapsed_sec": 14407.983389377594, "step_time_sec": 8.228152362979017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1741, "loss": 5.158604145050049, "lr": 0.0002, "elapsed_sec": 14416.212218046188, "step_time_sec": 8.228612105973298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1742, "loss": 5.1765241622924805, "lr": 0.0002, "elapsed_sec": 14424.441987037659, "step_time_sec": 8.229652202979196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1743, "loss": 5.125940799713135, "lr": 0.0002, "elapsed_sec": 14432.672752141953, "step_time_sec": 8.230554427980678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1744, "loss": 5.148566722869873, "lr": 0.0002, "elapsed_sec": 14440.904333114624, "step_time_sec": 8.231437039008597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1745, "loss": 5.119121551513672, "lr": 0.0002, "elapsed_sec": 14449.135149717331, "step_time_sec": 8.230708413990214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1746, "loss": 5.162078380584717, "lr": 0.0002, "elapsed_sec": 14457.363444566727, "step_time_sec": 8.228065231000073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1747, "loss": 5.166964530944824, "lr": 0.0002, "elapsed_sec": 14465.592246055603, "step_time_sec": 8.22867747899727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1748, "loss": 5.120932102203369, "lr": 0.0002, "elapsed_sec": 14473.823315143585, "step_time_sec": 8.23095115099568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1749, "loss": 5.085119247436523, "lr": 0.0002, "elapsed_sec": 14482.05495595932, "step_time_sec": 8.231408655992709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1750, "loss": 5.180343151092529, "lr": 0.0002, "elapsed_sec": 14490.285474300385, "step_time_sec": 8.23036081600003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1751, "loss": 5.183297634124756, "lr": 0.0002, "elapsed_sec": 14498.514747858047, "step_time_sec": 8.229099934978876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1752, "loss": 5.091614246368408, "lr": 0.0002, "elapsed_sec": 14506.742311239243, "step_time_sec": 8.227471235004487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1753, "loss": 5.16668701171875, "lr": 0.0002, "elapsed_sec": 14514.971285581589, "step_time_sec": 8.228748885012465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1754, "loss": 5.09870719909668, "lr": 0.0002, "elapsed_sec": 14523.200735569, "step_time_sec": 8.229360213008476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1755, "loss": 5.002506256103516, "lr": 0.0002, "elapsed_sec": 14531.428045034409, "step_time_sec": 8.227157980989432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1756, "loss": 5.123249530792236, "lr": 0.0002, "elapsed_sec": 14539.659929990768, "step_time_sec": 8.231703936005943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1757, "loss": 5.158659934997559, "lr": 0.0002, "elapsed_sec": 14547.889169931412, "step_time_sec": 8.229066427011276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1758, "loss": 5.157903671264648, "lr": 0.0002, "elapsed_sec": 14556.117198705673, "step_time_sec": 8.227827246009838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1759, "loss": 5.275635719299316, "lr": 0.0002, "elapsed_sec": 14564.345300197601, "step_time_sec": 8.227981289004674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1760, "loss": 5.078993797302246, "lr": 0.0002, "elapsed_sec": 14572.574861764908, "step_time_sec": 8.22936610400211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1761, "loss": 5.264904499053955, "lr": 0.0002, "elapsed_sec": 14580.805959701538, "step_time_sec": 8.230953071004478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1762, "loss": 5.0771660804748535, "lr": 0.0002, "elapsed_sec": 14589.036541700363, "step_time_sec": 8.230429941002512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1763, "loss": 5.036196708679199, "lr": 0.0002, "elapsed_sec": 14597.265169143677, "step_time_sec": 8.228484601015225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1764, "loss": 5.306055545806885, "lr": 0.0002, "elapsed_sec": 14605.494257926941, "step_time_sec": 8.228970541997114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1765, "loss": 5.040171146392822, "lr": 0.0002, "elapsed_sec": 14613.72153043747, "step_time_sec": 8.22703709101188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1766, "loss": 5.035990238189697, "lr": 0.0002, "elapsed_sec": 14621.9517993927, "step_time_sec": 8.230132536991732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1767, "loss": 5.209347248077393, "lr": 0.0002, "elapsed_sec": 14630.181181192398, "step_time_sec": 8.229283346008742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1768, "loss": 5.1051106452941895, "lr": 0.0002, "elapsed_sec": 14638.410643577576, "step_time_sec": 8.229314006021013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1769, "loss": 5.013489723205566, "lr": 0.0002, "elapsed_sec": 14646.639496564865, "step_time_sec": 8.228686933987774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1770, "loss": 5.262411117553711, "lr": 0.0002, "elapsed_sec": 14654.868458271027, "step_time_sec": 8.228736721008318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1771, "loss": 5.142910003662109, "lr": 0.0002, "elapsed_sec": 14663.097988843918, "step_time_sec": 8.229429230996175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1772, "loss": 5.044069766998291, "lr": 0.0002, "elapsed_sec": 14671.327970027924, "step_time_sec": 8.229793792997953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1773, "loss": 5.1606926918029785, "lr": 0.0002, "elapsed_sec": 14679.558117866516, "step_time_sec": 8.230008288024692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1774, "loss": 5.120678424835205, "lr": 0.0002, "elapsed_sec": 14687.788456439972, "step_time_sec": 8.230135717982193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1775, "loss": 5.122468948364258, "lr": 0.0002, "elapsed_sec": 14696.019447803497, "step_time_sec": 8.230880941002397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1776, "loss": 5.205641746520996, "lr": 0.0002, "elapsed_sec": 14704.2514398098, "step_time_sec": 8.231787315977272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1777, "loss": 5.0565290451049805, "lr": 0.0002, "elapsed_sec": 14712.481278181076, "step_time_sec": 8.229676540009677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1778, "loss": 5.283851146697998, "lr": 0.0002, "elapsed_sec": 14720.712983608246, "step_time_sec": 8.231579165003495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1779, "loss": 4.976420879364014, "lr": 0.0002, "elapsed_sec": 14728.941496133804, "step_time_sec": 8.228301181981806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1780, "loss": 5.161965847015381, "lr": 0.0002, "elapsed_sec": 14737.170700788498, "step_time_sec": 8.229049185989425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1781, "loss": 4.954302787780762, "lr": 0.0002, "elapsed_sec": 14745.401740074158, "step_time_sec": 8.230913440987933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1782, "loss": 5.162482261657715, "lr": 0.0002, "elapsed_sec": 14753.631860017776, "step_time_sec": 8.229920470999787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1783, "loss": 5.197453498840332, "lr": 0.0002, "elapsed_sec": 14761.863139152527, "step_time_sec": 8.231117477000225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1784, "loss": 4.973842620849609, "lr": 0.0002, "elapsed_sec": 14770.091295957565, "step_time_sec": 8.228064447990619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1785, "loss": 5.066488742828369, "lr": 0.0002, "elapsed_sec": 14778.320790529251, "step_time_sec": 8.229358832002617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1786, "loss": 5.193114757537842, "lr": 0.0002, "elapsed_sec": 14786.551618099213, "step_time_sec": 8.230613354011439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1787, "loss": 5.150945663452148, "lr": 0.0002, "elapsed_sec": 14794.782193660736, "step_time_sec": 8.230469838017598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1788, "loss": 5.173721790313721, "lr": 0.0002, "elapsed_sec": 14803.013524770737, "step_time_sec": 8.231128064973745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1789, "loss": 5.038779258728027, "lr": 0.0002, "elapsed_sec": 14811.242159128189, "step_time_sec": 8.228482998994878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1790, "loss": 5.014922618865967, "lr": 0.0002, "elapsed_sec": 14819.472429275513, "step_time_sec": 8.230172278999817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1791, "loss": 5.047393798828125, "lr": 0.0002, "elapsed_sec": 14827.701000452042, "step_time_sec": 8.228361956018489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1792, "loss": 5.105037212371826, "lr": 0.0002, "elapsed_sec": 14835.928799390793, "step_time_sec": 8.227650247019483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1793, "loss": 5.108584880828857, "lr": 0.0002, "elapsed_sec": 14844.156452417374, "step_time_sec": 8.227489308017539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1794, "loss": 5.0125579833984375, "lr": 0.0002, "elapsed_sec": 14852.38480424881, "step_time_sec": 8.228197439020732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1795, "loss": 5.082324504852295, "lr": 0.0002, "elapsed_sec": 14860.613726615906, "step_time_sec": 8.22875908401329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1796, "loss": 5.182806491851807, "lr": 0.0002, "elapsed_sec": 14868.846304655075, "step_time_sec": 8.232419338019099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1797, "loss": 5.099659442901611, "lr": 0.0002, "elapsed_sec": 14877.077005147934, "step_time_sec": 8.230536947987275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1798, "loss": 5.013326644897461, "lr": 0.0002, "elapsed_sec": 14885.307956933975, "step_time_sec": 8.230819663003786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1799, "loss": 5.117491722106934, "lr": 0.0002, "elapsed_sec": 14893.538294315338, "step_time_sec": 8.230245808023028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1800, "loss": 5.0511393547058105, "lr": 0.0002, "elapsed_sec": 14901.769534826279, "step_time_sec": 8.230995990976226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1801, "loss": 5.011392116546631, "lr": 0.0002, "elapsed_sec": 14909.997203350067, "step_time_sec": 8.227520059997914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1802, "loss": 5.013503551483154, "lr": 0.0002, "elapsed_sec": 14918.225643634796, "step_time_sec": 8.228307111014146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1803, "loss": 5.081010341644287, "lr": 0.0002, "elapsed_sec": 14926.45497226715, "step_time_sec": 8.229171464976389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1804, "loss": 5.090877056121826, "lr": 0.0002, "elapsed_sec": 14934.685718774796, "step_time_sec": 8.23061008399236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1805, "loss": 5.070024013519287, "lr": 0.0002, "elapsed_sec": 14942.916349887848, "step_time_sec": 8.230428256996674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1806, "loss": 5.262120246887207, "lr": 0.0002, "elapsed_sec": 14951.145606279373, "step_time_sec": 8.22917993098963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1807, "loss": 5.260026931762695, "lr": 0.0002, "elapsed_sec": 14959.376753091812, "step_time_sec": 8.230913631996373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1808, "loss": 5.055148124694824, "lr": 0.0002, "elapsed_sec": 14967.607962369919, "step_time_sec": 8.23102471799939, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1809, "loss": 5.079630374908447, "lr": 0.0002, "elapsed_sec": 14975.83810210228, "step_time_sec": 8.229972603003262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1810, "loss": 5.073517799377441, "lr": 0.0002, "elapsed_sec": 14984.069289207458, "step_time_sec": 8.231040796992602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1811, "loss": 5.148521900177002, "lr": 0.0002, "elapsed_sec": 14992.299286603928, "step_time_sec": 8.229853273980552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1812, "loss": 5.217446327209473, "lr": 0.0002, "elapsed_sec": 15000.530065536499, "step_time_sec": 8.23063877702225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1813, "loss": 5.065737724304199, "lr": 0.0002, "elapsed_sec": 15008.760954380035, "step_time_sec": 8.230730794981355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1814, "loss": 5.105092525482178, "lr": 0.0002, "elapsed_sec": 15016.98811340332, "step_time_sec": 8.226992210984463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1815, "loss": 5.068460941314697, "lr": 0.0002, "elapsed_sec": 15025.216932296753, "step_time_sec": 8.228612611012068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1816, "loss": 5.057052135467529, "lr": 0.0002, "elapsed_sec": 15033.443628549576, "step_time_sec": 8.226600791997043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1817, "loss": 5.214064121246338, "lr": 0.0002, "elapsed_sec": 15041.67367529869, "step_time_sec": 8.229850751988124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1818, "loss": 4.948578357696533, "lr": 0.0002, "elapsed_sec": 15049.901480913162, "step_time_sec": 8.227666251012124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1819, "loss": 5.055105209350586, "lr": 0.0002, "elapsed_sec": 15058.1301445961, "step_time_sec": 8.228464984014863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1820, "loss": 5.009084701538086, "lr": 0.0002, "elapsed_sec": 15066.358840227127, "step_time_sec": 8.228568304999499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1821, "loss": 5.180148124694824, "lr": 0.0002, "elapsed_sec": 15074.589549064636, "step_time_sec": 8.23057539301226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1822, "loss": 4.974043846130371, "lr": 0.0002, "elapsed_sec": 15082.82121682167, "step_time_sec": 8.231487159995595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1823, "loss": 5.091672897338867, "lr": 0.0002, "elapsed_sec": 15091.052140951157, "step_time_sec": 8.230767998000374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1824, "loss": 5.077847957611084, "lr": 0.0002, "elapsed_sec": 15099.283198356628, "step_time_sec": 8.230915172985988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1825, "loss": 5.15242338180542, "lr": 0.0002, "elapsed_sec": 15107.514617204666, "step_time_sec": 8.231250479992013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1826, "loss": 5.2490105628967285, "lr": 0.0002, "elapsed_sec": 15115.745725393295, "step_time_sec": 8.230938221007818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1827, "loss": 5.127140522003174, "lr": 0.0002, "elapsed_sec": 15123.974091053009, "step_time_sec": 8.228225874016061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1828, "loss": 5.084841251373291, "lr": 0.0002, "elapsed_sec": 15132.202680587769, "step_time_sec": 8.228402348002419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1829, "loss": 5.1569647789001465, "lr": 0.0002, "elapsed_sec": 15140.431484937668, "step_time_sec": 8.22871335601667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1830, "loss": 5.042111873626709, "lr": 0.0002, "elapsed_sec": 15148.660860538483, "step_time_sec": 8.229201462003402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1831, "loss": 5.045795440673828, "lr": 0.0002, "elapsed_sec": 15156.887476444244, "step_time_sec": 8.226425285014557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1832, "loss": 4.981960296630859, "lr": 0.0002, "elapsed_sec": 15165.117757797241, "step_time_sec": 8.230109849013388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1833, "loss": 5.04245662689209, "lr": 0.0002, "elapsed_sec": 15173.349002122879, "step_time_sec": 8.231090425979346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1834, "loss": 5.101820468902588, "lr": 0.0002, "elapsed_sec": 15181.579724311829, "step_time_sec": 8.230624904012075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1835, "loss": 5.0073652267456055, "lr": 0.0002, "elapsed_sec": 15189.810416460037, "step_time_sec": 8.230535807990236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1836, "loss": 5.072088718414307, "lr": 0.0002, "elapsed_sec": 15198.04007267952, "step_time_sec": 8.22945549999713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1837, "loss": 5.073742389678955, "lr": 0.0002, "elapsed_sec": 15206.269496202469, "step_time_sec": 8.229265848989598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1838, "loss": 5.054749011993408, "lr": 0.0002, "elapsed_sec": 15214.498435735703, "step_time_sec": 8.22882059699623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1839, "loss": 5.123669147491455, "lr": 0.0002, "elapsed_sec": 15222.728764772415, "step_time_sec": 8.230163284984883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1840, "loss": 5.077054023742676, "lr": 0.0002, "elapsed_sec": 15230.959287405014, "step_time_sec": 8.230359703011345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1841, "loss": 4.989439010620117, "lr": 0.0002, "elapsed_sec": 15239.190136671066, "step_time_sec": 8.230705569993006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1842, "loss": 5.079813003540039, "lr": 0.0002, "elapsed_sec": 15247.421263933182, "step_time_sec": 8.230962433997774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1843, "loss": 4.966151714324951, "lr": 0.0002, "elapsed_sec": 15255.651339530945, "step_time_sec": 8.229932895017555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1844, "loss": 5.2354888916015625, "lr": 0.0002, "elapsed_sec": 15263.878969430923, "step_time_sec": 8.227508879994275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1845, "loss": 5.157803058624268, "lr": 0.0002, "elapsed_sec": 15272.10795378685, "step_time_sec": 8.228767805005191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1846, "loss": 5.044526100158691, "lr": 0.0002, "elapsed_sec": 15280.337762355804, "step_time_sec": 8.22966481000185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1847, "loss": 5.09097146987915, "lr": 0.0002, "elapsed_sec": 15288.568143367767, "step_time_sec": 8.230224582977826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1848, "loss": 5.245993137359619, "lr": 0.0002, "elapsed_sec": 15296.799112796783, "step_time_sec": 8.230812647991115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1849, "loss": 5.129588603973389, "lr": 0.0002, "elapsed_sec": 15305.029247999191, "step_time_sec": 8.23001079299138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1850, "loss": 4.902318000793457, "lr": 0.0002, "elapsed_sec": 15313.25899028778, "step_time_sec": 8.229537663981318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1851, "loss": 5.000155448913574, "lr": 0.0002, "elapsed_sec": 15321.488377094269, "step_time_sec": 8.229225739982212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1852, "loss": 4.998627662658691, "lr": 0.0002, "elapsed_sec": 15329.719268798828, "step_time_sec": 8.230806834006216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1853, "loss": 5.144513130187988, "lr": 0.0002, "elapsed_sec": 15337.948714256287, "step_time_sec": 8.229215369006852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1854, "loss": 5.001450061798096, "lr": 0.0002, "elapsed_sec": 15346.176873922348, "step_time_sec": 8.228058186999988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1855, "loss": 5.138557434082031, "lr": 0.0002, "elapsed_sec": 15354.405711650848, "step_time_sec": 8.22863713800325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1856, "loss": 5.133574962615967, "lr": 0.0002, "elapsed_sec": 15362.634959697723, "step_time_sec": 8.229083051992347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1857, "loss": 5.1012725830078125, "lr": 0.0002, "elapsed_sec": 15370.865142822266, "step_time_sec": 8.230031026992947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1858, "loss": 4.950342655181885, "lr": 0.0002, "elapsed_sec": 15379.09551858902, "step_time_sec": 8.230268120008986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1859, "loss": 5.076935768127441, "lr": 0.0002, "elapsed_sec": 15387.326108932495, "step_time_sec": 8.230406615999527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1860, "loss": 5.119637489318848, "lr": 0.0002, "elapsed_sec": 15395.5564494133, "step_time_sec": 8.23016440999345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1861, "loss": 5.186173915863037, "lr": 0.0002, "elapsed_sec": 15403.787701368332, "step_time_sec": 8.231078643002547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1862, "loss": 5.086459159851074, "lr": 0.0002, "elapsed_sec": 15412.017137765884, "step_time_sec": 8.229328387998976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1863, "loss": 4.911352157592773, "lr": 0.0002, "elapsed_sec": 15420.247286319733, "step_time_sec": 8.22996721000527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1864, "loss": 5.121244430541992, "lr": 0.0002, "elapsed_sec": 15428.47852897644, "step_time_sec": 8.231057173019508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1865, "loss": 5.0791096687316895, "lr": 0.0002, "elapsed_sec": 15436.709617614746, "step_time_sec": 8.2309807560232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1866, "loss": 4.919623374938965, "lr": 0.0002, "elapsed_sec": 15444.941177845001, "step_time_sec": 8.231398941017687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1867, "loss": 5.007384300231934, "lr": 0.0002, "elapsed_sec": 15453.171506404877, "step_time_sec": 8.230135699996026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1868, "loss": 5.098748207092285, "lr": 0.0002, "elapsed_sec": 15461.40050983429, "step_time_sec": 8.228824231016915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1869, "loss": 5.068026065826416, "lr": 0.0002, "elapsed_sec": 15469.630448579788, "step_time_sec": 8.229806894989451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1870, "loss": 5.125720024108887, "lr": 0.0002, "elapsed_sec": 15477.860563755035, "step_time_sec": 8.230004248995101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1871, "loss": 5.001260280609131, "lr": 0.0002, "elapsed_sec": 15486.15039229393, "step_time_sec": 8.233235494000837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1872, "loss": 5.066832542419434, "lr": 0.0002, "elapsed_sec": 15494.379783391953, "step_time_sec": 8.22923468099907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1873, "loss": 5.129796504974365, "lr": 0.0002, "elapsed_sec": 15502.608201026917, "step_time_sec": 8.228282916999888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1874, "loss": 5.229033470153809, "lr": 0.0002, "elapsed_sec": 15510.835946559906, "step_time_sec": 8.227598472003592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1875, "loss": 5.0935163497924805, "lr": 0.0002, "elapsed_sec": 15519.06448340416, "step_time_sec": 8.228413539007306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1876, "loss": 5.177403450012207, "lr": 0.0002, "elapsed_sec": 15527.293666124344, "step_time_sec": 8.228988363989629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1877, "loss": 4.996833324432373, "lr": 0.0002, "elapsed_sec": 15535.52257657051, "step_time_sec": 8.228727678011637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1878, "loss": 5.138216972351074, "lr": 0.0002, "elapsed_sec": 15543.751089572906, "step_time_sec": 8.228350829012925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1879, "loss": 5.09609317779541, "lr": 0.0002, "elapsed_sec": 15551.978790521622, "step_time_sec": 8.22754764399724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1880, "loss": 5.096823692321777, "lr": 0.0002, "elapsed_sec": 15560.21027970314, "step_time_sec": 8.231415749003645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1881, "loss": 5.074191093444824, "lr": 0.0002, "elapsed_sec": 15568.44111442566, "step_time_sec": 8.230607756006066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1882, "loss": 5.001741409301758, "lr": 0.0002, "elapsed_sec": 15576.669642448425, "step_time_sec": 8.228401961008785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1883, "loss": 5.0490498542785645, "lr": 0.0002, "elapsed_sec": 15584.897879600525, "step_time_sec": 8.228040362009779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1884, "loss": 5.074704647064209, "lr": 0.0002, "elapsed_sec": 15593.126627206802, "step_time_sec": 8.228580635011895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1885, "loss": 4.9938554763793945, "lr": 0.0002, "elapsed_sec": 15601.358211278915, "step_time_sec": 8.231431245978456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1886, "loss": 5.142399311065674, "lr": 0.0002, "elapsed_sec": 15609.588168382645, "step_time_sec": 8.229816532984842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1887, "loss": 5.140115261077881, "lr": 0.0002, "elapsed_sec": 15617.818879842758, "step_time_sec": 8.23051283301902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1888, "loss": 5.058691024780273, "lr": 0.0002, "elapsed_sec": 15626.049096822739, "step_time_sec": 8.230042801005766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1889, "loss": 4.9704413414001465, "lr": 0.0002, "elapsed_sec": 15634.27998638153, "step_time_sec": 8.230811196990544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1890, "loss": 5.00160551071167, "lr": 0.0002, "elapsed_sec": 15642.509282588959, "step_time_sec": 8.229107640014263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1891, "loss": 5.072193622589111, "lr": 0.0002, "elapsed_sec": 15650.738352060318, "step_time_sec": 8.228848341997946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1892, "loss": 5.13345193862915, "lr": 0.0002, "elapsed_sec": 15658.965975046158, "step_time_sec": 8.227486373973079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1893, "loss": 5.177514553070068, "lr": 0.0002, "elapsed_sec": 15667.195257425308, "step_time_sec": 8.229149388993392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1894, "loss": 5.040550708770752, "lr": 0.0002, "elapsed_sec": 15675.423243284225, "step_time_sec": 8.22787813199102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1895, "loss": 5.140946865081787, "lr": 0.0002, "elapsed_sec": 15683.652549028397, "step_time_sec": 8.229067856998881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1896, "loss": 5.036363124847412, "lr": 0.0002, "elapsed_sec": 15691.880116939545, "step_time_sec": 8.227410489984322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1897, "loss": 5.1395111083984375, "lr": 0.0002, "elapsed_sec": 15700.107450008392, "step_time_sec": 8.227220746019157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1898, "loss": 5.0846686363220215, "lr": 0.0002, "elapsed_sec": 15708.33641076088, "step_time_sec": 8.228805910010124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1899, "loss": 5.05433464050293, "lr": 0.0002, "elapsed_sec": 15716.564720869064, "step_time_sec": 8.228119225008413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1900, "loss": 4.978627681732178, "lr": 0.0002, "elapsed_sec": 15724.795990228653, "step_time_sec": 8.231127087987261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1901, "loss": 5.081668376922607, "lr": 0.0002, "elapsed_sec": 15733.026935577393, "step_time_sec": 8.230795876996126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1902, "loss": 5.058319568634033, "lr": 0.0002, "elapsed_sec": 15741.253994464874, "step_time_sec": 8.226825735997409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1903, "loss": 4.990942001342773, "lr": 0.0002, "elapsed_sec": 15749.482917070389, "step_time_sec": 8.228786227002274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1904, "loss": 5.076462268829346, "lr": 0.0002, "elapsed_sec": 15757.711451530457, "step_time_sec": 8.228351689991541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1905, "loss": 4.997347354888916, "lr": 0.0002, "elapsed_sec": 15765.940813779831, "step_time_sec": 8.22917968899128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1906, "loss": 4.985584735870361, "lr": 0.0002, "elapsed_sec": 15774.170078754425, "step_time_sec": 8.229165178985568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1907, "loss": 4.982185363769531, "lr": 0.0002, "elapsed_sec": 15782.40023612976, "step_time_sec": 8.229966704995604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1908, "loss": 5.092463970184326, "lr": 0.0002, "elapsed_sec": 15790.63163471222, "step_time_sec": 8.231231328012655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1909, "loss": 5.118754863739014, "lr": 0.0002, "elapsed_sec": 15798.859780550003, "step_time_sec": 8.227986560988938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1910, "loss": 5.117187976837158, "lr": 0.0002, "elapsed_sec": 15807.088958263397, "step_time_sec": 8.229042254999513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1911, "loss": 5.047907829284668, "lr": 0.0002, "elapsed_sec": 15815.31865119934, "step_time_sec": 8.229570283991052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1912, "loss": 5.062713623046875, "lr": 0.0002, "elapsed_sec": 15823.548562049866, "step_time_sec": 8.229702022013953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1913, "loss": 5.032175540924072, "lr": 0.0002, "elapsed_sec": 15831.779416799545, "step_time_sec": 8.230710302013904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1914, "loss": 4.980936050415039, "lr": 0.0002, "elapsed_sec": 15840.009662866592, "step_time_sec": 8.230069681972964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1915, "loss": 5.088891983032227, "lr": 0.0002, "elapsed_sec": 15848.240149021149, "step_time_sec": 8.230344339011936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1916, "loss": 5.053956031799316, "lr": 0.0002, "elapsed_sec": 15856.469760894775, "step_time_sec": 8.2294577670109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1917, "loss": 4.99406099319458, "lr": 0.0002, "elapsed_sec": 15864.700160980225, "step_time_sec": 8.230298251000931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1918, "loss": 5.112574100494385, "lr": 0.0002, "elapsed_sec": 15872.930359363556, "step_time_sec": 8.230001534015173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1919, "loss": 5.0815043449401855, "lr": 0.0002, "elapsed_sec": 15881.16159248352, "step_time_sec": 8.231075512012467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1920, "loss": 5.091851234436035, "lr": 0.0002, "elapsed_sec": 15889.38989496231, "step_time_sec": 8.228211869980441, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1921, "loss": 5.032758712768555, "lr": 0.0002, "elapsed_sec": 15897.61809539795, "step_time_sec": 8.228001967974706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1922, "loss": 5.052602291107178, "lr": 0.0002, "elapsed_sec": 15905.849596977234, "step_time_sec": 8.231322167004691, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1923, "loss": 4.976047515869141, "lr": 0.0002, "elapsed_sec": 15914.080576896667, "step_time_sec": 8.230839891999494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1924, "loss": 5.06498908996582, "lr": 0.0002, "elapsed_sec": 15922.311562299728, "step_time_sec": 8.230801703000907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1925, "loss": 5.041213035583496, "lr": 0.0002, "elapsed_sec": 15930.542421579361, "step_time_sec": 8.230768328998238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1926, "loss": 5.018303871154785, "lr": 0.0002, "elapsed_sec": 15938.772359371185, "step_time_sec": 8.2297364360129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1927, "loss": 5.038211345672607, "lr": 0.0002, "elapsed_sec": 15947.003005981445, "step_time_sec": 8.23050725797657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1928, "loss": 5.070408344268799, "lr": 0.0002, "elapsed_sec": 15955.233808994293, "step_time_sec": 8.23065932400641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1929, "loss": 5.101343154907227, "lr": 0.0002, "elapsed_sec": 15963.464498758316, "step_time_sec": 8.230541855999036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1930, "loss": 5.101836681365967, "lr": 0.0002, "elapsed_sec": 15971.692072153091, "step_time_sec": 8.227374393987702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1931, "loss": 4.999782085418701, "lr": 0.0002, "elapsed_sec": 15979.921425104141, "step_time_sec": 8.229208582022693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1932, "loss": 4.945400714874268, "lr": 0.0002, "elapsed_sec": 15988.149897813797, "step_time_sec": 8.228296207002131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1933, "loss": 5.022439002990723, "lr": 0.0002, "elapsed_sec": 15996.38010430336, "step_time_sec": 8.230139760999009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1934, "loss": 5.077878475189209, "lr": 0.0002, "elapsed_sec": 16004.6109790802, "step_time_sec": 8.23068645800231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1935, "loss": 4.984251022338867, "lr": 0.0002, "elapsed_sec": 16012.839769363403, "step_time_sec": 8.228578051988734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1936, "loss": 4.987560272216797, "lr": 0.0002, "elapsed_sec": 16021.067847013474, "step_time_sec": 8.227919928991469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1937, "loss": 5.0147247314453125, "lr": 0.0002, "elapsed_sec": 16029.295760631561, "step_time_sec": 8.22777249198407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1938, "loss": 5.02937650680542, "lr": 0.0002, "elapsed_sec": 16037.526829242706, "step_time_sec": 8.23090514098294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1939, "loss": 5.064787864685059, "lr": 0.0002, "elapsed_sec": 16045.757185935974, "step_time_sec": 8.23020522398292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1940, "loss": 5.159794330596924, "lr": 0.0002, "elapsed_sec": 16053.988279104233, "step_time_sec": 8.230963981011882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1941, "loss": 5.035817623138428, "lr": 0.0002, "elapsed_sec": 16062.2193505764, "step_time_sec": 8.23087908999878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1942, "loss": 5.150062084197998, "lr": 0.0002, "elapsed_sec": 16070.450606107712, "step_time_sec": 8.231149896018906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1943, "loss": 5.096841812133789, "lr": 0.0002, "elapsed_sec": 16078.68104481697, "step_time_sec": 8.230234206013847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1944, "loss": 4.985398292541504, "lr": 0.0002, "elapsed_sec": 16086.912230968475, "step_time_sec": 8.231037452002056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1945, "loss": 5.058170318603516, "lr": 0.0002, "elapsed_sec": 16095.143303155899, "step_time_sec": 8.230991327989614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1946, "loss": 5.132395267486572, "lr": 0.0002, "elapsed_sec": 16103.373205900192, "step_time_sec": 8.229665903025307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1947, "loss": 4.995510578155518, "lr": 0.0002, "elapsed_sec": 16111.603417396545, "step_time_sec": 8.230056972010061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1948, "loss": 5.083045482635498, "lr": 0.0002, "elapsed_sec": 16119.83302640915, "step_time_sec": 8.229476513981353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1949, "loss": 4.854523658752441, "lr": 0.0002, "elapsed_sec": 16128.065064191818, "step_time_sec": 8.231883653003024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1950, "loss": 5.00015926361084, "lr": 0.0002, "elapsed_sec": 16136.292591810226, "step_time_sec": 8.227306497981772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1951, "loss": 4.924371719360352, "lr": 0.0002, "elapsed_sec": 16144.524121284485, "step_time_sec": 8.231392707995838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1952, "loss": 4.986765384674072, "lr": 0.0002, "elapsed_sec": 16152.755135536194, "step_time_sec": 8.230851041997084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1953, "loss": 4.977497577667236, "lr": 0.0002, "elapsed_sec": 16160.985565900803, "step_time_sec": 8.230279209994478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1954, "loss": 4.9065446853637695, "lr": 0.0002, "elapsed_sec": 16169.21618938446, "step_time_sec": 8.230533238995122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1955, "loss": 5.1114115715026855, "lr": 0.0002, "elapsed_sec": 16177.44795703888, "step_time_sec": 8.231531066994648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1956, "loss": 5.0524678230285645, "lr": 0.0002, "elapsed_sec": 16185.678333759308, "step_time_sec": 8.230262542987475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1957, "loss": 4.927937984466553, "lr": 0.0002, "elapsed_sec": 16193.909731149673, "step_time_sec": 8.231201246002456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1958, "loss": 4.9459967613220215, "lr": 0.0002, "elapsed_sec": 16202.141676664352, "step_time_sec": 8.231776664993959, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1959, "loss": 4.88058614730835, "lr": 0.0002, "elapsed_sec": 16210.370698213577, "step_time_sec": 8.228873361018486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1960, "loss": 4.971252918243408, "lr": 0.0002, "elapsed_sec": 16218.600885152817, "step_time_sec": 8.230008440994425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1961, "loss": 4.824376583099365, "lr": 0.0002, "elapsed_sec": 16226.832212209702, "step_time_sec": 8.231193435000023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1962, "loss": 4.974361419677734, "lr": 0.0002, "elapsed_sec": 16235.062900304794, "step_time_sec": 8.230580536008347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1963, "loss": 5.073472023010254, "lr": 0.0002, "elapsed_sec": 16243.294484615326, "step_time_sec": 8.231385857972782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1964, "loss": 5.049589157104492, "lr": 0.0002, "elapsed_sec": 16251.523772478104, "step_time_sec": 8.22911595198093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1965, "loss": 4.987788200378418, "lr": 0.0002, "elapsed_sec": 16259.75390791893, "step_time_sec": 8.230050949001452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1966, "loss": 5.10806941986084, "lr": 0.0002, "elapsed_sec": 16267.984359741211, "step_time_sec": 8.230240636999952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1967, "loss": 4.94236946105957, "lr": 0.0002, "elapsed_sec": 16276.215927362442, "step_time_sec": 8.231400940974709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1968, "loss": 4.990049362182617, "lr": 0.0002, "elapsed_sec": 16284.446140289307, "step_time_sec": 8.230060456000501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1969, "loss": 5.124480724334717, "lr": 0.0002, "elapsed_sec": 16292.676712989807, "step_time_sec": 8.230409887008136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1970, "loss": 4.981599807739258, "lr": 0.0002, "elapsed_sec": 16300.907529830933, "step_time_sec": 8.230667209019884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1971, "loss": 5.0085015296936035, "lr": 0.0002, "elapsed_sec": 16309.135202169418, "step_time_sec": 8.227534136007307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1972, "loss": 5.012034893035889, "lr": 0.0002, "elapsed_sec": 16317.36583685875, "step_time_sec": 8.230456907011103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1973, "loss": 4.933043003082275, "lr": 0.0002, "elapsed_sec": 16325.596355199814, "step_time_sec": 8.230393917998299, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1974, "loss": 5.085055828094482, "lr": 0.0002, "elapsed_sec": 16333.824751853943, "step_time_sec": 8.228245386009803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1975, "loss": 5.112534523010254, "lr": 0.0002, "elapsed_sec": 16342.053220748901, "step_time_sec": 8.228257643000688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1976, "loss": 4.951669216156006, "lr": 0.0002, "elapsed_sec": 16350.281380653381, "step_time_sec": 8.228062168986071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1977, "loss": 5.040303707122803, "lr": 0.0002, "elapsed_sec": 16358.511374950409, "step_time_sec": 8.22978081801557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1978, "loss": 4.964309215545654, "lr": 0.0002, "elapsed_sec": 16366.739020586014, "step_time_sec": 8.227538210019702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1979, "loss": 4.931437015533447, "lr": 0.0002, "elapsed_sec": 16374.969209194183, "step_time_sec": 8.230032853025477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1980, "loss": 4.96348237991333, "lr": 0.0002, "elapsed_sec": 16383.201058864594, "step_time_sec": 8.231654934003018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1981, "loss": 4.805417060852051, "lr": 0.0002, "elapsed_sec": 16391.431468248367, "step_time_sec": 8.23024201800581, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1982, "loss": 5.131181716918945, "lr": 0.0002, "elapsed_sec": 16399.661725521088, "step_time_sec": 8.230116384016583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1983, "loss": 4.8484296798706055, "lr": 0.0002, "elapsed_sec": 16407.89185643196, "step_time_sec": 8.229968479980016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1984, "loss": 5.031515598297119, "lr": 0.0002, "elapsed_sec": 16416.121099233627, "step_time_sec": 8.229086521023419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1985, "loss": 4.98311185836792, "lr": 0.0002, "elapsed_sec": 16424.349158763885, "step_time_sec": 8.22797728999285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1986, "loss": 4.996380805969238, "lr": 0.0002, "elapsed_sec": 16432.57721734047, "step_time_sec": 8.22781117699924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1987, "loss": 5.004439830780029, "lr": 0.0002, "elapsed_sec": 16440.806923866272, "step_time_sec": 8.229600227990886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1988, "loss": 4.919236660003662, "lr": 0.0002, "elapsed_sec": 16449.036454916, "step_time_sec": 8.229350569017697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1989, "loss": 5.057791233062744, "lr": 0.0002, "elapsed_sec": 16457.26655960083, "step_time_sec": 8.229945767001482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1990, "loss": 5.009000301361084, "lr": 0.0002, "elapsed_sec": 16465.49732732773, "step_time_sec": 8.230606672004797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1991, "loss": 5.0311455726623535, "lr": 0.0002, "elapsed_sec": 16473.727620601654, "step_time_sec": 8.230127640999854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1992, "loss": 4.9475998878479, "lr": 0.0002, "elapsed_sec": 16481.95858478546, "step_time_sec": 8.230816685012542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1993, "loss": 4.91194486618042, "lr": 0.0002, "elapsed_sec": 16490.18976688385, "step_time_sec": 8.231078373006312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1994, "loss": 4.994056701660156, "lr": 0.0002, "elapsed_sec": 16498.421026945114, "step_time_sec": 8.231160924013238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1995, "loss": 4.860559940338135, "lr": 0.0002, "elapsed_sec": 16506.649134159088, "step_time_sec": 8.227859106002143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1996, "loss": 5.069502830505371, "lr": 0.0002, "elapsed_sec": 16514.875955104828, "step_time_sec": 8.226681000989629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1997, "loss": 4.9285688400268555, "lr": 0.0002, "elapsed_sec": 16523.10609817505, "step_time_sec": 8.229928774992004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1998, "loss": 5.187678813934326, "lr": 0.0002, "elapsed_sec": 16531.33589220047, "step_time_sec": 8.22967946899007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 1999, "loss": 4.956171989440918, "lr": 0.0002, "elapsed_sec": 16539.566649198532, "step_time_sec": 8.230565238016425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2000, "loss": 5.0978593826293945, "lr": 0.0002, "elapsed_sec": 16547.79737353325, "step_time_sec": 51.642767673998605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9843624049972277, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2001, "loss": 5.010745048522949, "lr": 0.0002, "elapsed_sec": 16599.43814253807, "step_time_sec": 8.228483654005686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2002, "loss": 4.945369720458984, "lr": 0.0002, "elapsed_sec": 16607.65859413147, "step_time_sec": 8.220269062992884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2003, "loss": 5.0396575927734375, "lr": 0.0002, "elapsed_sec": 16615.888442516327, "step_time_sec": 8.229719271010254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2004, "loss": 4.913992881774902, "lr": 0.0002, "elapsed_sec": 16624.117659330368, "step_time_sec": 8.229042294988176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2005, "loss": 5.000729560852051, "lr": 0.0002, "elapsed_sec": 16632.348264217377, "step_time_sec": 8.230524511978729, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2006, "loss": 4.897037506103516, "lr": 0.0002, "elapsed_sec": 16640.580003738403, "step_time_sec": 8.231560526997782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2007, "loss": 5.061288356781006, "lr": 0.0002, "elapsed_sec": 16648.810408353806, "step_time_sec": 8.230178500001784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2008, "loss": 4.877204418182373, "lr": 0.0002, "elapsed_sec": 16657.040421009064, "step_time_sec": 8.229882157000247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2009, "loss": 5.065770626068115, "lr": 0.0002, "elapsed_sec": 16665.270178556442, "step_time_sec": 8.22966437699506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2010, "loss": 4.970103740692139, "lr": 0.0002, "elapsed_sec": 16673.50159716606, "step_time_sec": 8.231208774988772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2011, "loss": 4.889019012451172, "lr": 0.0002, "elapsed_sec": 16681.73235821724, "step_time_sec": 8.230604910000693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2012, "loss": 5.007341384887695, "lr": 0.0002, "elapsed_sec": 16689.963245630264, "step_time_sec": 8.230717660975643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2013, "loss": 4.890741348266602, "lr": 0.0002, "elapsed_sec": 16698.19399356842, "step_time_sec": 8.230597323999973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2014, "loss": 4.991355895996094, "lr": 0.0002, "elapsed_sec": 16706.42207145691, "step_time_sec": 8.22791659401264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2015, "loss": 4.8223347663879395, "lr": 0.0002, "elapsed_sec": 16714.651631593704, "step_time_sec": 8.22939413698623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2016, "loss": 4.956272602081299, "lr": 0.0002, "elapsed_sec": 16722.880802631378, "step_time_sec": 8.229025172011461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2017, "loss": 4.958982467651367, "lr": 0.0002, "elapsed_sec": 16731.1109187603, "step_time_sec": 8.229990613006521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2018, "loss": 4.9058403968811035, "lr": 0.0002, "elapsed_sec": 16739.340932369232, "step_time_sec": 8.229855643003248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2019, "loss": 4.832540988922119, "lr": 0.0002, "elapsed_sec": 16747.57107782364, "step_time_sec": 8.229969809995964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2020, "loss": 5.036129474639893, "lr": 0.0002, "elapsed_sec": 16755.801085472107, "step_time_sec": 8.229914594994625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2021, "loss": 4.940530776977539, "lr": 0.0002, "elapsed_sec": 16764.031287431717, "step_time_sec": 8.229964825994102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2022, "loss": 4.974854469299316, "lr": 0.0002, "elapsed_sec": 16772.262854099274, "step_time_sec": 8.231390418979572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2023, "loss": 4.937746047973633, "lr": 0.0002, "elapsed_sec": 16780.49428296089, "step_time_sec": 8.2313261149975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2024, "loss": 4.882196426391602, "lr": 0.0002, "elapsed_sec": 16788.725382089615, "step_time_sec": 8.230888927995693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2025, "loss": 5.0318803787231445, "lr": 0.0002, "elapsed_sec": 16796.956459522247, "step_time_sec": 8.230918738001492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2026, "loss": 4.963298320770264, "lr": 0.0002, "elapsed_sec": 16805.18752336502, "step_time_sec": 8.230959289998282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2027, "loss": 4.989414215087891, "lr": 0.0002, "elapsed_sec": 16813.418123722076, "step_time_sec": 8.230423781002173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2028, "loss": 4.934896945953369, "lr": 0.0002, "elapsed_sec": 16821.649060726166, "step_time_sec": 8.230781205027597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2029, "loss": 4.981855392456055, "lr": 0.0002, "elapsed_sec": 16829.878360509872, "step_time_sec": 8.229132165986812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2030, "loss": 4.807308673858643, "lr": 0.0002, "elapsed_sec": 16838.108574151993, "step_time_sec": 8.230048358993372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2031, "loss": 4.887735366821289, "lr": 0.0002, "elapsed_sec": 16846.338452339172, "step_time_sec": 8.229774217994418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2032, "loss": 4.996971607208252, "lr": 0.0002, "elapsed_sec": 16854.568344593048, "step_time_sec": 8.22969661699608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2033, "loss": 4.842093467712402, "lr": 0.0002, "elapsed_sec": 16862.79547381401, "step_time_sec": 8.226984235981945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2034, "loss": 4.987602233886719, "lr": 0.0002, "elapsed_sec": 16871.02547621727, "step_time_sec": 8.229874067008495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2035, "loss": 4.962336540222168, "lr": 0.0002, "elapsed_sec": 16879.255106449127, "step_time_sec": 8.229470139020123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2036, "loss": 4.948355197906494, "lr": 0.0002, "elapsed_sec": 16887.486127853394, "step_time_sec": 8.230848951003281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2037, "loss": 4.973325252532959, "lr": 0.0002, "elapsed_sec": 16895.71751189232, "step_time_sec": 8.231310647999635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2038, "loss": 4.932426452636719, "lr": 0.0002, "elapsed_sec": 16903.94892334938, "step_time_sec": 8.231176945992047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2039, "loss": 4.970998764038086, "lr": 0.0002, "elapsed_sec": 16912.180952310562, "step_time_sec": 8.2318698290037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2040, "loss": 4.842235565185547, "lr": 0.0002, "elapsed_sec": 16920.4113445282, "step_time_sec": 8.230257729999721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2041, "loss": 4.912482738494873, "lr": 0.0002, "elapsed_sec": 16928.64235520363, "step_time_sec": 8.230846947000828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2042, "loss": 4.958948612213135, "lr": 0.0002, "elapsed_sec": 16936.874004364014, "step_time_sec": 8.231525833019987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2043, "loss": 4.998132705688477, "lr": 0.0002, "elapsed_sec": 16945.104964256287, "step_time_sec": 8.230754121992504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2044, "loss": 4.862117767333984, "lr": 0.0002, "elapsed_sec": 16953.33608031273, "step_time_sec": 8.230963123001857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2045, "loss": 4.9190144538879395, "lr": 0.0002, "elapsed_sec": 16961.566826820374, "step_time_sec": 8.23060366298887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2046, "loss": 4.9205474853515625, "lr": 0.0002, "elapsed_sec": 16969.79828810692, "step_time_sec": 8.231326436973177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2047, "loss": 4.947099685668945, "lr": 0.0002, "elapsed_sec": 16978.028181552887, "step_time_sec": 8.229762289003702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2048, "loss": 4.927303314208984, "lr": 0.0002, "elapsed_sec": 16986.259430646896, "step_time_sec": 8.231103457015706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2049, "loss": 4.88123893737793, "lr": 0.0002, "elapsed_sec": 16994.48705291748, "step_time_sec": 8.227422599011334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2050, "loss": 4.902896404266357, "lr": 0.0002, "elapsed_sec": 17002.71673154831, "step_time_sec": 8.229524548980407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2051, "loss": 4.839085578918457, "lr": 0.0002, "elapsed_sec": 17010.944429159164, "step_time_sec": 8.227586296998197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2052, "loss": 4.908681869506836, "lr": 0.0002, "elapsed_sec": 17019.17470407486, "step_time_sec": 8.230070549994707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2053, "loss": 4.796061992645264, "lr": 0.0002, "elapsed_sec": 17027.405627012253, "step_time_sec": 8.230757068988169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2054, "loss": 4.8961992263793945, "lr": 0.0002, "elapsed_sec": 17035.634647607803, "step_time_sec": 8.228868826001417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2055, "loss": 4.8309125900268555, "lr": 0.0002, "elapsed_sec": 17043.864847183228, "step_time_sec": 8.230052052997053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2056, "loss": 4.865597248077393, "lr": 0.0002, "elapsed_sec": 17052.093797922134, "step_time_sec": 8.228776293020928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2057, "loss": 4.857273578643799, "lr": 0.0002, "elapsed_sec": 17060.322417259216, "step_time_sec": 8.22853786899941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2058, "loss": 4.917491912841797, "lr": 0.0002, "elapsed_sec": 17068.551773548126, "step_time_sec": 8.229135322995717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2059, "loss": 4.886888027191162, "lr": 0.0002, "elapsed_sec": 17076.780792474747, "step_time_sec": 8.228941366978688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2060, "loss": 4.8131022453308105, "lr": 0.0002, "elapsed_sec": 17085.009380578995, "step_time_sec": 8.228359814995201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2061, "loss": 5.0766096115112305, "lr": 0.0002, "elapsed_sec": 17093.2396569252, "step_time_sec": 8.230118977022357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2062, "loss": 4.854176998138428, "lr": 0.0002, "elapsed_sec": 17101.46853327751, "step_time_sec": 8.228782268008217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2063, "loss": 4.834217071533203, "lr": 0.0002, "elapsed_sec": 17109.699104070663, "step_time_sec": 8.23039599598269, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2064, "loss": 4.746886253356934, "lr": 0.0002, "elapsed_sec": 17117.929466724396, "step_time_sec": 8.230190677015344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2065, "loss": 4.903706073760986, "lr": 0.0002, "elapsed_sec": 17126.159986019135, "step_time_sec": 8.230420663021505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2066, "loss": 4.851858139038086, "lr": 0.0002, "elapsed_sec": 17134.39189195633, "step_time_sec": 8.231665542989504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2067, "loss": 4.821674346923828, "lr": 0.0002, "elapsed_sec": 17142.622262716293, "step_time_sec": 8.230247833009344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2068, "loss": 4.960068225860596, "lr": 0.0002, "elapsed_sec": 17150.85326552391, "step_time_sec": 8.230825801001629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2069, "loss": 4.9647955894470215, "lr": 0.0002, "elapsed_sec": 17159.083664894104, "step_time_sec": 8.230268106999574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2070, "loss": 4.923617839813232, "lr": 0.0002, "elapsed_sec": 17167.31501889229, "step_time_sec": 8.231197402987164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2071, "loss": 4.937320709228516, "lr": 0.0002, "elapsed_sec": 17175.545838832855, "step_time_sec": 8.23070076899603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2072, "loss": 4.876791477203369, "lr": 0.0002, "elapsed_sec": 17183.776409864426, "step_time_sec": 8.230343013012316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2073, "loss": 4.834181785583496, "lr": 0.0002, "elapsed_sec": 17192.008519411087, "step_time_sec": 8.232038542977534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2074, "loss": 4.970451831817627, "lr": 0.0002, "elapsed_sec": 17200.239710092545, "step_time_sec": 8.230951867997646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2075, "loss": 4.994749069213867, "lr": 0.0002, "elapsed_sec": 17208.469220399857, "step_time_sec": 8.229403596022166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2076, "loss": 4.871640682220459, "lr": 0.0002, "elapsed_sec": 17216.697973251343, "step_time_sec": 8.228552897984628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2077, "loss": 4.856750965118408, "lr": 0.0002, "elapsed_sec": 17224.926955461502, "step_time_sec": 8.228838743991219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2078, "loss": 4.856607913970947, "lr": 0.0002, "elapsed_sec": 17233.155390024185, "step_time_sec": 8.228263486991636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2079, "loss": 4.879632949829102, "lr": 0.0002, "elapsed_sec": 17241.384822368622, "step_time_sec": 8.229348816006677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2080, "loss": 4.836623668670654, "lr": 0.0002, "elapsed_sec": 17249.61559700966, "step_time_sec": 8.230577825015644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2081, "loss": 4.868025302886963, "lr": 0.0002, "elapsed_sec": 17257.84531211853, "step_time_sec": 8.229551233001985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2082, "loss": 4.865211486816406, "lr": 0.0002, "elapsed_sec": 17266.0741379261, "step_time_sec": 8.228710321011022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2083, "loss": 4.8929853439331055, "lr": 0.0002, "elapsed_sec": 17274.302903413773, "step_time_sec": 8.228571903979173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2084, "loss": 4.84815788269043, "lr": 0.0002, "elapsed_sec": 17282.53352212906, "step_time_sec": 8.230466301989509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2085, "loss": 4.847899436950684, "lr": 0.0002, "elapsed_sec": 17290.762614250183, "step_time_sec": 8.22892334900098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2086, "loss": 5.0487236976623535, "lr": 0.0002, "elapsed_sec": 17298.9913854599, "step_time_sec": 8.228625871008262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2087, "loss": 4.862522125244141, "lr": 0.0002, "elapsed_sec": 17307.220891952515, "step_time_sec": 8.229351932997815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2088, "loss": 4.9990410804748535, "lr": 0.0002, "elapsed_sec": 17315.45222234726, "step_time_sec": 8.231183284980943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2089, "loss": 4.868011474609375, "lr": 0.0002, "elapsed_sec": 17323.683426856995, "step_time_sec": 8.231056627002545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2090, "loss": 4.855659008026123, "lr": 0.0002, "elapsed_sec": 17331.914480924606, "step_time_sec": 8.23089242100832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2091, "loss": 4.964559078216553, "lr": 0.0002, "elapsed_sec": 17340.145613193512, "step_time_sec": 8.230980320018716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2092, "loss": 4.843496799468994, "lr": 0.0002, "elapsed_sec": 17348.37642979622, "step_time_sec": 8.23066356900381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2093, "loss": 4.960544109344482, "lr": 0.0002, "elapsed_sec": 17356.606745243073, "step_time_sec": 8.230215087009128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2094, "loss": 4.859277248382568, "lr": 0.0002, "elapsed_sec": 17364.8369679451, "step_time_sec": 8.230006830999628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2095, "loss": 5.015407085418701, "lr": 0.0002, "elapsed_sec": 17373.067501306534, "step_time_sec": 8.23039634001907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2096, "loss": 5.056461334228516, "lr": 0.0002, "elapsed_sec": 17381.298358678818, "step_time_sec": 8.230743231019005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2097, "loss": 4.838261127471924, "lr": 0.0002, "elapsed_sec": 17389.531203985214, "step_time_sec": 8.232673830003478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2098, "loss": 4.961207866668701, "lr": 0.0002, "elapsed_sec": 17397.75849699974, "step_time_sec": 8.227092249988345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2099, "loss": 4.800625324249268, "lr": 0.0002, "elapsed_sec": 17405.988467931747, "step_time_sec": 8.229818121995777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2100, "loss": 5.026813983917236, "lr": 0.0002, "elapsed_sec": 17414.216695070267, "step_time_sec": 8.228087349998532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2101, "loss": 4.966194152832031, "lr": 0.0002, "elapsed_sec": 17422.445519685745, "step_time_sec": 8.228663365996908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2102, "loss": 4.804290294647217, "lr": 0.0002, "elapsed_sec": 17430.673986196518, "step_time_sec": 8.228311094018864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2103, "loss": 4.892853260040283, "lr": 0.0002, "elapsed_sec": 17438.903077840805, "step_time_sec": 8.228938523010584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2104, "loss": 4.817123889923096, "lr": 0.0002, "elapsed_sec": 17447.130961418152, "step_time_sec": 8.227770158991916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2105, "loss": 4.921813488006592, "lr": 0.0002, "elapsed_sec": 17455.36231613159, "step_time_sec": 8.231151649000822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2106, "loss": 4.8349151611328125, "lr": 0.0002, "elapsed_sec": 17463.59358024597, "step_time_sec": 8.231102109013591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2107, "loss": 4.832493782043457, "lr": 0.0002, "elapsed_sec": 17471.82475399971, "step_time_sec": 8.231009314011317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2108, "loss": 4.894941329956055, "lr": 0.0002, "elapsed_sec": 17480.055624723434, "step_time_sec": 8.230711446987698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2109, "loss": 4.965118408203125, "lr": 0.0002, "elapsed_sec": 17488.28681063652, "step_time_sec": 8.231052129005548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2110, "loss": 4.841169834136963, "lr": 0.0002, "elapsed_sec": 17496.517176628113, "step_time_sec": 8.230294160021003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2111, "loss": 4.82327938079834, "lr": 0.0002, "elapsed_sec": 17504.747573137283, "step_time_sec": 8.230157609010348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2112, "loss": 4.870457649230957, "lr": 0.0002, "elapsed_sec": 17512.978296279907, "step_time_sec": 8.23059123600251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2113, "loss": 4.93668270111084, "lr": 0.0002, "elapsed_sec": 17521.209446907043, "step_time_sec": 8.230985989008332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2114, "loss": 4.879837512969971, "lr": 0.0002, "elapsed_sec": 17529.440944194794, "step_time_sec": 8.231390197994187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2115, "loss": 4.814072608947754, "lr": 0.0002, "elapsed_sec": 17537.672744512558, "step_time_sec": 8.231608416012023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2116, "loss": 4.785638332366943, "lr": 0.0002, "elapsed_sec": 17545.903626203537, "step_time_sec": 8.23071599699324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2117, "loss": 4.975583076477051, "lr": 0.0002, "elapsed_sec": 17554.1351749897, "step_time_sec": 8.231473126012133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2118, "loss": 4.685171127319336, "lr": 0.0002, "elapsed_sec": 17562.366924762726, "step_time_sec": 8.231537381012458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2119, "loss": 4.809004783630371, "lr": 0.0002, "elapsed_sec": 17570.59636950493, "step_time_sec": 8.229287227994064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2120, "loss": 4.785568714141846, "lr": 0.0002, "elapsed_sec": 17578.82692170143, "step_time_sec": 8.230381297995336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2121, "loss": 4.861395835876465, "lr": 0.0002, "elapsed_sec": 17587.05375790596, "step_time_sec": 8.226688911992824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2122, "loss": 4.8860907554626465, "lr": 0.0002, "elapsed_sec": 17595.282344341278, "step_time_sec": 8.228498818993103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2123, "loss": 4.902971267700195, "lr": 0.0002, "elapsed_sec": 17603.510372161865, "step_time_sec": 8.227815855992958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2124, "loss": 4.815361499786377, "lr": 0.0002, "elapsed_sec": 17611.73954963684, "step_time_sec": 8.229079226002796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2125, "loss": 4.891315937042236, "lr": 0.0002, "elapsed_sec": 17619.968631267548, "step_time_sec": 8.22887067499687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2126, "loss": 4.912464141845703, "lr": 0.0002, "elapsed_sec": 17628.199442386627, "step_time_sec": 8.230707940005232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2127, "loss": 4.864314556121826, "lr": 0.0002, "elapsed_sec": 17636.430351495743, "step_time_sec": 8.230708327988395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2128, "loss": 4.935466766357422, "lr": 0.0002, "elapsed_sec": 17644.658254146576, "step_time_sec": 8.227753193001263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2129, "loss": 5.0345683097839355, "lr": 0.0002, "elapsed_sec": 17652.887645959854, "step_time_sec": 8.229249222000362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2130, "loss": 4.906734466552734, "lr": 0.0002, "elapsed_sec": 17661.116021871567, "step_time_sec": 8.228200196987018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2131, "loss": 4.879029750823975, "lr": 0.0002, "elapsed_sec": 17669.3463037014, "step_time_sec": 8.230152131989598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2132, "loss": 4.895771503448486, "lr": 0.0002, "elapsed_sec": 17677.57709622383, "step_time_sec": 8.230652771017049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2133, "loss": 4.872305870056152, "lr": 0.0002, "elapsed_sec": 17685.805997371674, "step_time_sec": 8.228689660987584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2134, "loss": 4.904547214508057, "lr": 0.0002, "elapsed_sec": 17694.03507733345, "step_time_sec": 8.228922743990552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2135, "loss": 4.817139148712158, "lr": 0.0002, "elapsed_sec": 17702.264574289322, "step_time_sec": 8.229324205982266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2136, "loss": 4.822421550750732, "lr": 0.0002, "elapsed_sec": 17710.495001792908, "step_time_sec": 8.230296755995369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2137, "loss": 4.869329929351807, "lr": 0.0002, "elapsed_sec": 17718.72533583641, "step_time_sec": 8.230176822980866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2138, "loss": 4.993053436279297, "lr": 0.0002, "elapsed_sec": 17726.953271865845, "step_time_sec": 8.2278321509948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2139, "loss": 4.882951259613037, "lr": 0.0002, "elapsed_sec": 17735.18252825737, "step_time_sec": 8.229057696007658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2140, "loss": 4.843679428100586, "lr": 0.0002, "elapsed_sec": 17743.411913633347, "step_time_sec": 8.229202343994984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2141, "loss": 4.89755392074585, "lr": 0.0002, "elapsed_sec": 17751.643037319183, "step_time_sec": 8.231002975982847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2142, "loss": 4.998235702514648, "lr": 0.0002, "elapsed_sec": 17759.87376689911, "step_time_sec": 8.230542853998486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2143, "loss": 4.897313594818115, "lr": 0.0002, "elapsed_sec": 17768.10135769844, "step_time_sec": 8.227440076996572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2144, "loss": 4.8216705322265625, "lr": 0.0002, "elapsed_sec": 17776.332443237305, "step_time_sec": 8.230929428013042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2145, "loss": 4.924628257751465, "lr": 0.0002, "elapsed_sec": 17784.563894748688, "step_time_sec": 8.231304414017359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2146, "loss": 4.912807941436768, "lr": 0.0002, "elapsed_sec": 17792.794061422348, "step_time_sec": 8.230020785005763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2147, "loss": 4.844423294067383, "lr": 0.0002, "elapsed_sec": 17801.02428793907, "step_time_sec": 8.23008214100264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2148, "loss": 4.8171186447143555, "lr": 0.0002, "elapsed_sec": 17809.255403757095, "step_time_sec": 8.23095176499919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2149, "loss": 4.815960884094238, "lr": 0.0002, "elapsed_sec": 17817.48413348198, "step_time_sec": 8.228558030998101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2150, "loss": 4.823194980621338, "lr": 0.0002, "elapsed_sec": 17825.71288871765, "step_time_sec": 8.228603919997113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2151, "loss": 4.813809871673584, "lr": 0.0002, "elapsed_sec": 17833.943628311157, "step_time_sec": 8.230641192989424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2152, "loss": 4.971696853637695, "lr": 0.0002, "elapsed_sec": 17842.17454600334, "step_time_sec": 8.230770253983792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2153, "loss": 4.794343948364258, "lr": 0.0002, "elapsed_sec": 17850.405046224594, "step_time_sec": 8.230299035989447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2154, "loss": 4.823320388793945, "lr": 0.0002, "elapsed_sec": 17858.636110067368, "step_time_sec": 8.23091243300587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2155, "loss": 4.84894323348999, "lr": 0.0002, "elapsed_sec": 17866.867606639862, "step_time_sec": 8.23139906700817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2156, "loss": 4.862388610839844, "lr": 0.0002, "elapsed_sec": 17875.098644018173, "step_time_sec": 8.230818207986886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2157, "loss": 4.812862873077393, "lr": 0.0002, "elapsed_sec": 17883.328562021255, "step_time_sec": 8.229741404997185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2158, "loss": 4.936522483825684, "lr": 0.0002, "elapsed_sec": 17891.55749464035, "step_time_sec": 8.228778815013357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2159, "loss": 4.966897487640381, "lr": 0.0002, "elapsed_sec": 17899.78738594055, "step_time_sec": 8.22974469300243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2160, "loss": 4.893511772155762, "lr": 0.0002, "elapsed_sec": 17908.01800751686, "step_time_sec": 8.230508856009692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2161, "loss": 4.862572193145752, "lr": 0.0002, "elapsed_sec": 17916.24896287918, "step_time_sec": 8.23076402398874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2162, "loss": 4.915312767028809, "lr": 0.0002, "elapsed_sec": 17924.480214595795, "step_time_sec": 8.231104915990727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2163, "loss": 4.957468032836914, "lr": 0.0002, "elapsed_sec": 17932.710639238358, "step_time_sec": 8.230268148996402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2164, "loss": 4.8358001708984375, "lr": 0.0002, "elapsed_sec": 17940.939398527145, "step_time_sec": 8.228597427980276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2165, "loss": 4.790856838226318, "lr": 0.0002, "elapsed_sec": 17949.16844725609, "step_time_sec": 8.22891017299844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2166, "loss": 4.806599140167236, "lr": 0.0002, "elapsed_sec": 17957.39923262596, "step_time_sec": 8.230701491993386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2167, "loss": 4.810802936553955, "lr": 0.0002, "elapsed_sec": 17965.629276037216, "step_time_sec": 8.229808256000979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2168, "loss": 4.790072441101074, "lr": 0.0002, "elapsed_sec": 17973.85628938675, "step_time_sec": 8.226949758012779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2169, "loss": 4.839962959289551, "lr": 0.0002, "elapsed_sec": 17982.08527970314, "step_time_sec": 8.228821497003082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2170, "loss": 4.77271032333374, "lr": 0.0002, "elapsed_sec": 17990.314997196198, "step_time_sec": 8.229510636010673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2171, "loss": 4.7878241539001465, "lr": 0.0002, "elapsed_sec": 17998.54496860504, "step_time_sec": 8.229857947007986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2172, "loss": 5.049398899078369, "lr": 0.0002, "elapsed_sec": 18006.77543759346, "step_time_sec": 8.230306601006305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2173, "loss": 4.980289936065674, "lr": 0.0002, "elapsed_sec": 18015.00629544258, "step_time_sec": 8.230718281003647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2174, "loss": 4.789319038391113, "lr": 0.0002, "elapsed_sec": 18023.23733496666, "step_time_sec": 8.230795441981172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2175, "loss": 4.913626194000244, "lr": 0.0002, "elapsed_sec": 18031.466247797012, "step_time_sec": 8.228783484984888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2176, "loss": 4.886028289794922, "lr": 0.0002, "elapsed_sec": 18039.69656395912, "step_time_sec": 8.230156259989599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2177, "loss": 4.886553764343262, "lr": 0.0002, "elapsed_sec": 18047.927366018295, "step_time_sec": 8.230642609007191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2178, "loss": 4.844274997711182, "lr": 0.0002, "elapsed_sec": 18056.15491604805, "step_time_sec": 8.227386213984573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2179, "loss": 4.916290760040283, "lr": 0.0002, "elapsed_sec": 18064.38212275505, "step_time_sec": 8.227126076992135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2180, "loss": 4.919339179992676, "lr": 0.0002, "elapsed_sec": 18072.61089038849, "step_time_sec": 8.228612014005193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2181, "loss": 4.7758941650390625, "lr": 0.0002, "elapsed_sec": 18080.84129524231, "step_time_sec": 8.230161883984692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2182, "loss": 4.8162922859191895, "lr": 0.0002, "elapsed_sec": 18089.071779966354, "step_time_sec": 8.230331853992539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2183, "loss": 4.834508895874023, "lr": 0.0002, "elapsed_sec": 18097.30191397667, "step_time_sec": 8.230025786993792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2184, "loss": 4.922464847564697, "lr": 0.0002, "elapsed_sec": 18105.532630205154, "step_time_sec": 8.230565556994407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2185, "loss": 4.8522491455078125, "lr": 0.0002, "elapsed_sec": 18113.762688159943, "step_time_sec": 8.229863018990727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2186, "loss": 4.936788082122803, "lr": 0.0002, "elapsed_sec": 18121.992441177368, "step_time_sec": 8.22964758600574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2187, "loss": 4.65391206741333, "lr": 0.0002, "elapsed_sec": 18130.22301006317, "step_time_sec": 8.230404172005365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2188, "loss": 4.844244956970215, "lr": 0.0002, "elapsed_sec": 18138.453807115555, "step_time_sec": 8.230645164992893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2189, "loss": 4.890284538269043, "lr": 0.0002, "elapsed_sec": 18146.684846639633, "step_time_sec": 8.230846176011255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2190, "loss": 4.928420543670654, "lr": 0.0002, "elapsed_sec": 18154.91538119316, "step_time_sec": 8.230377874016995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2191, "loss": 4.8493242263793945, "lr": 0.0002, "elapsed_sec": 18163.146166801453, "step_time_sec": 8.23065794998547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2192, "loss": 4.895724773406982, "lr": 0.0002, "elapsed_sec": 18171.374997854233, "step_time_sec": 8.228678263025358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2193, "loss": 4.857178211212158, "lr": 0.0002, "elapsed_sec": 18179.605229854584, "step_time_sec": 8.230078677996062, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2194, "loss": 4.888773441314697, "lr": 0.0002, "elapsed_sec": 18187.835045576096, "step_time_sec": 8.229612227994949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2195, "loss": 4.77186393737793, "lr": 0.0002, "elapsed_sec": 18196.06614780426, "step_time_sec": 8.230953988997499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2196, "loss": 4.69636869430542, "lr": 0.0002, "elapsed_sec": 18204.297831773758, "step_time_sec": 8.23153605099651, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2197, "loss": 4.769316673278809, "lr": 0.0002, "elapsed_sec": 18212.52585864067, "step_time_sec": 8.227917910000542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2198, "loss": 4.875066757202148, "lr": 0.0002, "elapsed_sec": 18220.753890037537, "step_time_sec": 8.227829983021365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2199, "loss": 4.891704082489014, "lr": 0.0002, "elapsed_sec": 18228.98477745056, "step_time_sec": 8.230783229984809, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2200, "loss": 4.898096561431885, "lr": 0.0002, "elapsed_sec": 18237.215879678726, "step_time_sec": 8.230934924009489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2201, "loss": 4.829716205596924, "lr": 0.0002, "elapsed_sec": 18245.44783616066, "step_time_sec": 8.231770520011196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2202, "loss": 4.846755504608154, "lr": 0.0002, "elapsed_sec": 18253.67858862877, "step_time_sec": 8.230665848008357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2203, "loss": 4.795799732208252, "lr": 0.0002, "elapsed_sec": 18261.907845020294, "step_time_sec": 8.22906017000787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2204, "loss": 4.83109188079834, "lr": 0.0002, "elapsed_sec": 18270.136410474777, "step_time_sec": 8.22837914401316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2205, "loss": 4.712667465209961, "lr": 0.0002, "elapsed_sec": 18278.366794347763, "step_time_sec": 8.230238346994156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2206, "loss": 4.732531547546387, "lr": 0.0002, "elapsed_sec": 18286.59761762619, "step_time_sec": 8.230709695024416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2207, "loss": 4.817819118499756, "lr": 0.0002, "elapsed_sec": 18294.82904934883, "step_time_sec": 8.231232583988458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2208, "loss": 4.814803600311279, "lr": 0.0002, "elapsed_sec": 18303.060304880142, "step_time_sec": 8.231109511980321, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2209, "loss": 4.76968240737915, "lr": 0.0002, "elapsed_sec": 18311.29038476944, "step_time_sec": 8.229933611990418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2210, "loss": 4.7153143882751465, "lr": 0.0002, "elapsed_sec": 18319.519159078598, "step_time_sec": 8.228657373983879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2211, "loss": 4.872645854949951, "lr": 0.0002, "elapsed_sec": 18327.74899506569, "step_time_sec": 8.229692686989438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2212, "loss": 4.791602611541748, "lr": 0.0002, "elapsed_sec": 18335.979495048523, "step_time_sec": 8.230331572995055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2213, "loss": 4.891279220581055, "lr": 0.0002, "elapsed_sec": 18344.20891046524, "step_time_sec": 8.229224191018147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2214, "loss": 4.749175071716309, "lr": 0.0002, "elapsed_sec": 18352.43812274933, "step_time_sec": 8.229072519985493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2215, "loss": 4.791592121124268, "lr": 0.0002, "elapsed_sec": 18360.666640281677, "step_time_sec": 8.228356595995137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2216, "loss": 4.828591823577881, "lr": 0.0002, "elapsed_sec": 18368.895621061325, "step_time_sec": 8.228824227000587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2217, "loss": 4.775829315185547, "lr": 0.0002, "elapsed_sec": 18377.12240409851, "step_time_sec": 8.2266714060097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2218, "loss": 4.794594764709473, "lr": 0.0002, "elapsed_sec": 18385.35222840309, "step_time_sec": 8.229640681005549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2219, "loss": 4.77903413772583, "lr": 0.0002, "elapsed_sec": 18393.580349206924, "step_time_sec": 8.22796139199636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2220, "loss": 4.784205436706543, "lr": 0.0002, "elapsed_sec": 18401.81122326851, "step_time_sec": 8.230721528001595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2221, "loss": 4.802043914794922, "lr": 0.0002, "elapsed_sec": 18410.043501377106, "step_time_sec": 8.232105917995796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2222, "loss": 4.86336088180542, "lr": 0.0002, "elapsed_sec": 18418.27381658554, "step_time_sec": 8.23024312700727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2223, "loss": 4.852276802062988, "lr": 0.0002, "elapsed_sec": 18426.504714012146, "step_time_sec": 8.230672222009161, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2224, "loss": 4.924962997436523, "lr": 0.0002, "elapsed_sec": 18434.73502588272, "step_time_sec": 8.230147088004742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2225, "loss": 4.861051559448242, "lr": 0.0002, "elapsed_sec": 18442.966096639633, "step_time_sec": 8.230936259002192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2226, "loss": 4.805220127105713, "lr": 0.0002, "elapsed_sec": 18451.194474697113, "step_time_sec": 8.228220081015024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2227, "loss": 4.901761054992676, "lr": 0.0002, "elapsed_sec": 18459.423813581467, "step_time_sec": 8.229206348012667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2228, "loss": 4.86838436126709, "lr": 0.0002, "elapsed_sec": 18467.65181541443, "step_time_sec": 8.227839739993215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2229, "loss": 4.880248069763184, "lr": 0.0002, "elapsed_sec": 18475.882201194763, "step_time_sec": 8.230184977001045, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2230, "loss": 4.849673748016357, "lr": 0.0002, "elapsed_sec": 18484.110452890396, "step_time_sec": 8.228088726988062, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2231, "loss": 4.740013599395752, "lr": 0.0002, "elapsed_sec": 18492.340631484985, "step_time_sec": 8.230067061987938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2232, "loss": 4.767248153686523, "lr": 0.0002, "elapsed_sec": 18500.570996046066, "step_time_sec": 8.23022949398728, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2233, "loss": 4.850771427154541, "lr": 0.0002, "elapsed_sec": 18508.8027009964, "step_time_sec": 8.231455311994068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2234, "loss": 4.846970081329346, "lr": 0.0002, "elapsed_sec": 18517.033622980118, "step_time_sec": 8.230797716998495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2235, "loss": 4.814979076385498, "lr": 0.0002, "elapsed_sec": 18525.26469516754, "step_time_sec": 8.230918332003057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2236, "loss": 4.799464702606201, "lr": 0.0002, "elapsed_sec": 18533.495631217957, "step_time_sec": 8.23074547300348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2237, "loss": 4.93635368347168, "lr": 0.0002, "elapsed_sec": 18541.726621627808, "step_time_sec": 8.230922640999779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2238, "loss": 4.7237229347229, "lr": 0.0002, "elapsed_sec": 18549.957582473755, "step_time_sec": 8.230732559983153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2239, "loss": 4.842893600463867, "lr": 0.0002, "elapsed_sec": 18558.187222480774, "step_time_sec": 8.229525759990793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2240, "loss": 4.8754754066467285, "lr": 0.0002, "elapsed_sec": 18566.41654586792, "step_time_sec": 8.229125751997344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2241, "loss": 4.828309535980225, "lr": 0.0002, "elapsed_sec": 18574.646907806396, "step_time_sec": 8.230236119998153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2242, "loss": 4.893211841583252, "lr": 0.0002, "elapsed_sec": 18582.877806186676, "step_time_sec": 8.230770285998005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2243, "loss": 4.792243003845215, "lr": 0.0002, "elapsed_sec": 18591.106385469437, "step_time_sec": 8.228381371009164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2244, "loss": 4.8741278648376465, "lr": 0.0002, "elapsed_sec": 18599.33822631836, "step_time_sec": 8.23171577998437, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2245, "loss": 4.690305233001709, "lr": 0.0002, "elapsed_sec": 18607.568950891495, "step_time_sec": 8.230547552986536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2246, "loss": 5.033898830413818, "lr": 0.0002, "elapsed_sec": 18615.79990887642, "step_time_sec": 8.230796143994667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2247, "loss": 4.845549583435059, "lr": 0.0002, "elapsed_sec": 18624.031232357025, "step_time_sec": 8.231132846005494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2248, "loss": 4.734635829925537, "lr": 0.0002, "elapsed_sec": 18632.262013435364, "step_time_sec": 8.230641600006493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2249, "loss": 4.838695049285889, "lr": 0.0002, "elapsed_sec": 18640.49248814583, "step_time_sec": 8.23029095097445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2250, "loss": 4.7483439445495605, "lr": 0.0002, "elapsed_sec": 18648.723388433456, "step_time_sec": 8.230712084012339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2251, "loss": 4.7952961921691895, "lr": 0.0002, "elapsed_sec": 18656.954241752625, "step_time_sec": 8.230710901989369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2252, "loss": 4.926731586456299, "lr": 0.0002, "elapsed_sec": 18665.184502840042, "step_time_sec": 8.230109035997884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2253, "loss": 4.784350395202637, "lr": 0.0002, "elapsed_sec": 18673.41343188286, "step_time_sec": 8.228752014023485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2254, "loss": 4.909426212310791, "lr": 0.0002, "elapsed_sec": 18681.644889354706, "step_time_sec": 8.231301295018056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2255, "loss": 4.726560592651367, "lr": 0.0002, "elapsed_sec": 18689.876026153564, "step_time_sec": 8.231037753022974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2256, "loss": 4.815653324127197, "lr": 0.0002, "elapsed_sec": 18698.106916189194, "step_time_sec": 8.230684554000618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2257, "loss": 4.829016208648682, "lr": 0.0002, "elapsed_sec": 18706.33755493164, "step_time_sec": 8.230441290012095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2258, "loss": 4.737600326538086, "lr": 0.0002, "elapsed_sec": 18714.569975852966, "step_time_sec": 8.23225041400292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2259, "loss": 4.812471389770508, "lr": 0.0002, "elapsed_sec": 18722.800270080566, "step_time_sec": 8.230155623983592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2260, "loss": 4.6410746574401855, "lr": 0.0002, "elapsed_sec": 18731.03151535988, "step_time_sec": 8.23115127699566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2261, "loss": 4.9842529296875, "lr": 0.0002, "elapsed_sec": 18739.26133465767, "step_time_sec": 8.229593435011338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2262, "loss": 4.841548442840576, "lr": 0.0002, "elapsed_sec": 18747.49051785469, "step_time_sec": 8.229071364010451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2263, "loss": 4.947880268096924, "lr": 0.0002, "elapsed_sec": 18755.719839334488, "step_time_sec": 8.229192900995258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2264, "loss": 4.7924065589904785, "lr": 0.0002, "elapsed_sec": 18763.948459148407, "step_time_sec": 8.228395416983403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2265, "loss": 4.8202128410339355, "lr": 0.0002, "elapsed_sec": 18772.177578687668, "step_time_sec": 8.228946481016465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2266, "loss": 4.899280071258545, "lr": 0.0002, "elapsed_sec": 18780.40721821785, "step_time_sec": 8.22952704198542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2267, "loss": 4.936580181121826, "lr": 0.0002, "elapsed_sec": 18788.6372756958, "step_time_sec": 8.229832920973422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2268, "loss": 4.87842321395874, "lr": 0.0002, "elapsed_sec": 18796.868629217148, "step_time_sec": 8.231218574015656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2269, "loss": 4.831888198852539, "lr": 0.0002, "elapsed_sec": 18805.09877061844, "step_time_sec": 8.229953432979528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2270, "loss": 4.917411804199219, "lr": 0.0002, "elapsed_sec": 18813.329909324646, "step_time_sec": 8.231019375991309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2271, "loss": 4.722024440765381, "lr": 0.0002, "elapsed_sec": 18821.561265468597, "step_time_sec": 8.23115686600795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2272, "loss": 4.784018516540527, "lr": 0.0002, "elapsed_sec": 18829.792846679688, "step_time_sec": 8.231493783008773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2273, "loss": 4.813552379608154, "lr": 0.0002, "elapsed_sec": 18838.023223876953, "step_time_sec": 8.230235529015772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2274, "loss": 4.7305588722229, "lr": 0.0002, "elapsed_sec": 18846.2546274662, "step_time_sec": 8.23123116401257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2275, "loss": 4.756203651428223, "lr": 0.0002, "elapsed_sec": 18854.485694408417, "step_time_sec": 8.230834215995856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2276, "loss": 4.801923751831055, "lr": 0.0002, "elapsed_sec": 18862.71655368805, "step_time_sec": 8.23074194698711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2277, "loss": 4.911331653594971, "lr": 0.0002, "elapsed_sec": 18870.94772219658, "step_time_sec": 8.230977825005539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2278, "loss": 4.738588333129883, "lr": 0.0002, "elapsed_sec": 18879.17780470848, "step_time_sec": 8.229936657997314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2279, "loss": 4.796480178833008, "lr": 0.0002, "elapsed_sec": 18887.40896129608, "step_time_sec": 8.23101675498765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2280, "loss": 4.762732982635498, "lr": 0.0002, "elapsed_sec": 18895.63914346695, "step_time_sec": 8.229969789012102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2281, "loss": 4.84388542175293, "lr": 0.0002, "elapsed_sec": 18903.869485378265, "step_time_sec": 8.230202950013336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2282, "loss": 4.769160747528076, "lr": 0.0002, "elapsed_sec": 18912.100565195084, "step_time_sec": 8.23091467199265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2283, "loss": 4.800039768218994, "lr": 0.0002, "elapsed_sec": 18920.331569433212, "step_time_sec": 8.230900132009992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2284, "loss": 4.577500820159912, "lr": 0.0002, "elapsed_sec": 18928.56236219406, "step_time_sec": 8.230613382009324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2285, "loss": 4.745461940765381, "lr": 0.0002, "elapsed_sec": 18936.792929172516, "step_time_sec": 8.230411601020023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2286, "loss": 4.9296956062316895, "lr": 0.0002, "elapsed_sec": 18945.024091959, "step_time_sec": 8.230955420003738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2287, "loss": 4.843530178070068, "lr": 0.0002, "elapsed_sec": 18953.252767562866, "step_time_sec": 8.228567247017054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2288, "loss": 4.808889865875244, "lr": 0.0002, "elapsed_sec": 18961.482862472534, "step_time_sec": 8.229900677019032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2289, "loss": 4.715086460113525, "lr": 0.0002, "elapsed_sec": 18969.71359229088, "step_time_sec": 8.230571392021375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2290, "loss": 4.795016765594482, "lr": 0.0002, "elapsed_sec": 18977.942490816116, "step_time_sec": 8.228702410007827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2291, "loss": 4.896559715270996, "lr": 0.0002, "elapsed_sec": 18986.1715528965, "step_time_sec": 8.228891737002414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2292, "loss": 4.871973991394043, "lr": 0.0002, "elapsed_sec": 18994.400887966156, "step_time_sec": 8.229201967013068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2293, "loss": 4.886851787567139, "lr": 0.0002, "elapsed_sec": 19002.631474018097, "step_time_sec": 8.230469815985998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2294, "loss": 5.040112495422363, "lr": 0.0002, "elapsed_sec": 19010.86383652687, "step_time_sec": 8.232198412995785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2295, "loss": 4.92019510269165, "lr": 0.0002, "elapsed_sec": 19019.093718767166, "step_time_sec": 8.22967680299189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2296, "loss": 4.798545837402344, "lr": 0.0002, "elapsed_sec": 19027.324726343155, "step_time_sec": 8.230861547024688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2297, "loss": 4.828321933746338, "lr": 0.0002, "elapsed_sec": 19035.55520749092, "step_time_sec": 8.230382720998023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2298, "loss": 4.881422519683838, "lr": 0.0002, "elapsed_sec": 19043.786579847336, "step_time_sec": 8.231140827992931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2299, "loss": 4.756677627563477, "lr": 0.0002, "elapsed_sec": 19052.016054868698, "step_time_sec": 8.229308668000158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2300, "loss": 4.835336208343506, "lr": 0.0002, "elapsed_sec": 19060.244488954544, "step_time_sec": 8.22829167699092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2301, "loss": 4.724372863769531, "lr": 0.0002, "elapsed_sec": 19068.475511312485, "step_time_sec": 8.230903102026787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2302, "loss": 4.736508846282959, "lr": 0.0002, "elapsed_sec": 19076.706547260284, "step_time_sec": 8.230823655001586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2303, "loss": 4.765850067138672, "lr": 0.0002, "elapsed_sec": 19084.93567943573, "step_time_sec": 8.228976746991975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2304, "loss": 4.691445350646973, "lr": 0.0002, "elapsed_sec": 19093.16407084465, "step_time_sec": 8.228245870996034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2305, "loss": 4.745620250701904, "lr": 0.0002, "elapsed_sec": 19101.392572164536, "step_time_sec": 8.22833112400258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2306, "loss": 4.870036602020264, "lr": 0.0002, "elapsed_sec": 19109.622022628784, "step_time_sec": 8.229320598999038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2307, "loss": 4.7810773849487305, "lr": 0.0002, "elapsed_sec": 19117.852553606033, "step_time_sec": 8.230379822984105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2308, "loss": 4.898680210113525, "lr": 0.0002, "elapsed_sec": 19126.081876277924, "step_time_sec": 8.229135044995928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2309, "loss": 4.803769588470459, "lr": 0.0002, "elapsed_sec": 19134.31226849556, "step_time_sec": 8.230219852994196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2310, "loss": 4.7458648681640625, "lr": 0.0002, "elapsed_sec": 19142.540450572968, "step_time_sec": 8.228011546016205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2311, "loss": 4.727264404296875, "lr": 0.0002, "elapsed_sec": 19150.77096056938, "step_time_sec": 8.230355598992901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2312, "loss": 4.737241744995117, "lr": 0.0002, "elapsed_sec": 19159.002520799637, "step_time_sec": 8.231369483983144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2313, "loss": 4.774022102355957, "lr": 0.0002, "elapsed_sec": 19167.23127889633, "step_time_sec": 8.228640787972836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2314, "loss": 4.7937445640563965, "lr": 0.0002, "elapsed_sec": 19175.461001634598, "step_time_sec": 8.229540832020575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2315, "loss": 4.8584794998168945, "lr": 0.0002, "elapsed_sec": 19183.690865039825, "step_time_sec": 8.229741988005117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2316, "loss": 4.711976051330566, "lr": 0.0002, "elapsed_sec": 19191.92036652565, "step_time_sec": 8.229284158995142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2317, "loss": 4.713544845581055, "lr": 0.0002, "elapsed_sec": 19200.149309635162, "step_time_sec": 8.228802241006633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2318, "loss": 4.872134208679199, "lr": 0.0002, "elapsed_sec": 19208.378608226776, "step_time_sec": 8.229146301018773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2319, "loss": 4.780914306640625, "lr": 0.0002, "elapsed_sec": 19216.608791589737, "step_time_sec": 8.230062389018713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2320, "loss": 4.775059700012207, "lr": 0.0002, "elapsed_sec": 19224.839712142944, "step_time_sec": 8.230722086009337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2321, "loss": 4.749627113342285, "lr": 0.0002, "elapsed_sec": 19233.070664167404, "step_time_sec": 8.230820465978468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2322, "loss": 4.836220741271973, "lr": 0.0002, "elapsed_sec": 19241.30196595192, "step_time_sec": 8.231152484018821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2323, "loss": 4.58985710144043, "lr": 0.0002, "elapsed_sec": 19249.533225297928, "step_time_sec": 8.231084927014308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2324, "loss": 4.8482747077941895, "lr": 0.0002, "elapsed_sec": 19257.763398885727, "step_time_sec": 8.230054785002721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2325, "loss": 4.86237096786499, "lr": 0.0002, "elapsed_sec": 19265.995031118393, "step_time_sec": 8.231397462019231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2326, "loss": 4.704056739807129, "lr": 0.0002, "elapsed_sec": 19274.22533106804, "step_time_sec": 8.230165590997785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2327, "loss": 4.654994487762451, "lr": 0.0002, "elapsed_sec": 19282.45626115799, "step_time_sec": 8.230754202028038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2328, "loss": 4.741962432861328, "lr": 0.0002, "elapsed_sec": 19290.687625169754, "step_time_sec": 8.231228164979257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2329, "loss": 4.786570072174072, "lr": 0.0002, "elapsed_sec": 19298.918853521347, "step_time_sec": 8.231106338003883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2330, "loss": 4.761305332183838, "lr": 0.0002, "elapsed_sec": 19307.147225618362, "step_time_sec": 8.22817647899501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2331, "loss": 4.865091800689697, "lr": 0.0002, "elapsed_sec": 19315.376630544662, "step_time_sec": 8.229252616001759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2332, "loss": 4.845639228820801, "lr": 0.0002, "elapsed_sec": 19323.607513666153, "step_time_sec": 8.230784106010105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2333, "loss": 4.698814868927002, "lr": 0.0002, "elapsed_sec": 19331.837917804718, "step_time_sec": 8.230225479986984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2334, "loss": 4.707276821136475, "lr": 0.0002, "elapsed_sec": 19340.068511009216, "step_time_sec": 8.230387574003544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2335, "loss": 4.751241207122803, "lr": 0.0002, "elapsed_sec": 19348.298630475998, "step_time_sec": 8.22998295200523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2336, "loss": 4.858397483825684, "lr": 0.0002, "elapsed_sec": 19356.52729678154, "step_time_sec": 8.22856759800925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2337, "loss": 4.725650310516357, "lr": 0.0002, "elapsed_sec": 19364.7570810318, "step_time_sec": 8.229561660991749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2338, "loss": 4.779136657714844, "lr": 0.0002, "elapsed_sec": 19372.985991477966, "step_time_sec": 8.228807274979772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2339, "loss": 4.787586688995361, "lr": 0.0002, "elapsed_sec": 19381.216473340988, "step_time_sec": 8.230256996990647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2340, "loss": 4.8608622550964355, "lr": 0.0002, "elapsed_sec": 19389.447409391403, "step_time_sec": 8.230809866974596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2341, "loss": 4.691782474517822, "lr": 0.0002, "elapsed_sec": 19397.678426265717, "step_time_sec": 8.230860280978959, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2342, "loss": 4.816005706787109, "lr": 0.0002, "elapsed_sec": 19405.908870458603, "step_time_sec": 8.230255857022712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2343, "loss": 4.768830299377441, "lr": 0.0002, "elapsed_sec": 19414.139772892, "step_time_sec": 8.230821297998773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2344, "loss": 4.769991874694824, "lr": 0.0002, "elapsed_sec": 19422.371205091476, "step_time_sec": 8.23121861400432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2345, "loss": 4.65903902053833, "lr": 0.0002, "elapsed_sec": 19430.599168539047, "step_time_sec": 8.227826197980903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2346, "loss": 4.77360200881958, "lr": 0.0002, "elapsed_sec": 19438.828580379486, "step_time_sec": 8.229276281985221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2347, "loss": 4.753153324127197, "lr": 0.0002, "elapsed_sec": 19447.05860209465, "step_time_sec": 8.229848514980404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2348, "loss": 4.865962982177734, "lr": 0.0002, "elapsed_sec": 19455.288746118546, "step_time_sec": 8.2300435929792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2349, "loss": 4.852320194244385, "lr": 0.0002, "elapsed_sec": 19463.520123004913, "step_time_sec": 8.231143543001963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2350, "loss": 4.687441825866699, "lr": 0.0002, "elapsed_sec": 19471.75065636635, "step_time_sec": 8.23039528899244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2351, "loss": 4.641199111938477, "lr": 0.0002, "elapsed_sec": 19479.98077273369, "step_time_sec": 8.229964233993087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2352, "loss": 4.773565769195557, "lr": 0.0002, "elapsed_sec": 19488.208938121796, "step_time_sec": 8.228072309022537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2353, "loss": 4.77940034866333, "lr": 0.0002, "elapsed_sec": 19496.4380402565, "step_time_sec": 8.228871725994395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2354, "loss": 4.749731063842773, "lr": 0.0002, "elapsed_sec": 19504.667098760605, "step_time_sec": 8.22888970299391, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2355, "loss": 4.765299320220947, "lr": 0.0002, "elapsed_sec": 19512.895028829575, "step_time_sec": 8.227828030998353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2356, "loss": 4.837575435638428, "lr": 0.0002, "elapsed_sec": 19521.12397146225, "step_time_sec": 8.228715315024601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2357, "loss": 4.798303127288818, "lr": 0.0002, "elapsed_sec": 19529.354865312576, "step_time_sec": 8.230746891989838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2358, "loss": 4.671627044677734, "lr": 0.0002, "elapsed_sec": 19537.584696292877, "step_time_sec": 8.229720843984978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2359, "loss": 4.807071685791016, "lr": 0.0002, "elapsed_sec": 19545.815420627594, "step_time_sec": 8.2305179979885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2360, "loss": 4.613560676574707, "lr": 0.0002, "elapsed_sec": 19554.04495859146, "step_time_sec": 8.229422813019482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2361, "loss": 4.691130638122559, "lr": 0.0002, "elapsed_sec": 19562.2736825943, "step_time_sec": 8.228546525991987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2362, "loss": 4.858081817626953, "lr": 0.0002, "elapsed_sec": 19570.504650592804, "step_time_sec": 8.230794275994413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2363, "loss": 4.79318904876709, "lr": 0.0002, "elapsed_sec": 19578.735916614532, "step_time_sec": 8.23112217898597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2364, "loss": 4.643394947052002, "lr": 0.0002, "elapsed_sec": 19586.966446876526, "step_time_sec": 8.230354776984314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2365, "loss": 4.688083648681641, "lr": 0.0002, "elapsed_sec": 19595.196723222733, "step_time_sec": 8.23012093998841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2366, "loss": 4.781771183013916, "lr": 0.0002, "elapsed_sec": 19603.42808699608, "step_time_sec": 8.231203208008083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2367, "loss": 4.765212535858154, "lr": 0.0002, "elapsed_sec": 19611.658438444138, "step_time_sec": 8.230190294998465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2368, "loss": 4.705460071563721, "lr": 0.0002, "elapsed_sec": 19619.888051748276, "step_time_sec": 8.229509906988824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2369, "loss": 4.627439022064209, "lr": 0.0002, "elapsed_sec": 19628.116582632065, "step_time_sec": 8.228358783002477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2370, "loss": 4.787360668182373, "lr": 0.0002, "elapsed_sec": 19636.346708536148, "step_time_sec": 8.229910609981744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2371, "loss": 4.728940486907959, "lr": 0.0002, "elapsed_sec": 19644.57829475403, "step_time_sec": 8.231410603999393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2372, "loss": 4.744108200073242, "lr": 0.0002, "elapsed_sec": 19652.80833864212, "step_time_sec": 8.229882283019833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2373, "loss": 4.637429237365723, "lr": 0.0002, "elapsed_sec": 19661.038973093033, "step_time_sec": 8.230443771986756, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2374, "loss": 4.731886863708496, "lr": 0.0002, "elapsed_sec": 19669.270052671432, "step_time_sec": 8.230943023983855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2375, "loss": 4.7091522216796875, "lr": 0.0002, "elapsed_sec": 19677.499387025833, "step_time_sec": 8.229121929005487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2376, "loss": 4.753643989562988, "lr": 0.0002, "elapsed_sec": 19685.728912353516, "step_time_sec": 8.229359403019771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2377, "loss": 4.7645182609558105, "lr": 0.0002, "elapsed_sec": 19693.958233118057, "step_time_sec": 8.229183616000228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2378, "loss": 4.692090034484863, "lr": 0.0002, "elapsed_sec": 19702.189049720764, "step_time_sec": 8.230626963981194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2379, "loss": 4.663738250732422, "lr": 0.0002, "elapsed_sec": 19710.419360876083, "step_time_sec": 8.230165151995607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2380, "loss": 4.692519187927246, "lr": 0.0002, "elapsed_sec": 19718.649361610413, "step_time_sec": 8.229815636994317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2381, "loss": 4.82426118850708, "lr": 0.0002, "elapsed_sec": 19726.879484176636, "step_time_sec": 8.230008215992711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2382, "loss": 4.753925323486328, "lr": 0.0002, "elapsed_sec": 19735.110205888748, "step_time_sec": 8.230539420997957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2383, "loss": 4.651071548461914, "lr": 0.0002, "elapsed_sec": 19743.340978622437, "step_time_sec": 8.230623947019922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2384, "loss": 4.762782573699951, "lr": 0.0002, "elapsed_sec": 19751.571607112885, "step_time_sec": 8.230481747013982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2385, "loss": 4.779146194458008, "lr": 0.0002, "elapsed_sec": 19759.802931547165, "step_time_sec": 8.231076075986493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2386, "loss": 4.661280155181885, "lr": 0.0002, "elapsed_sec": 19768.03251194954, "step_time_sec": 8.229502117988886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2387, "loss": 4.733428955078125, "lr": 0.0002, "elapsed_sec": 19776.262954950333, "step_time_sec": 8.23024604501552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2388, "loss": 4.747662544250488, "lr": 0.0002, "elapsed_sec": 19784.4936337471, "step_time_sec": 8.230567869002698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2389, "loss": 4.728288173675537, "lr": 0.0002, "elapsed_sec": 19792.724373579025, "step_time_sec": 8.230568359984318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2390, "loss": 4.865763187408447, "lr": 0.0002, "elapsed_sec": 19800.955919265747, "step_time_sec": 8.231364798994036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2391, "loss": 4.767153739929199, "lr": 0.0002, "elapsed_sec": 19809.186064004898, "step_time_sec": 8.229996106994804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2392, "loss": 4.839120864868164, "lr": 0.0002, "elapsed_sec": 19817.41578936577, "step_time_sec": 8.22952526397421, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2393, "loss": 4.697601318359375, "lr": 0.0002, "elapsed_sec": 19825.646631240845, "step_time_sec": 8.23070489501697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2394, "loss": 4.598567008972168, "lr": 0.0002, "elapsed_sec": 19833.878626346588, "step_time_sec": 8.231827200012049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2395, "loss": 4.876355171203613, "lr": 0.0002, "elapsed_sec": 19842.109034776688, "step_time_sec": 8.230224960978376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2396, "loss": 4.79788064956665, "lr": 0.0002, "elapsed_sec": 19850.33989715576, "step_time_sec": 8.230708931980189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2397, "loss": 4.673642158508301, "lr": 0.0002, "elapsed_sec": 19858.571119070053, "step_time_sec": 8.231109963002382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2398, "loss": 4.578042507171631, "lr": 0.0002, "elapsed_sec": 19866.800609588623, "step_time_sec": 8.22926956199808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2399, "loss": 4.801578998565674, "lr": 0.0002, "elapsed_sec": 19875.03054881096, "step_time_sec": 8.229850023984909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2400, "loss": 4.775453567504883, "lr": 0.0002, "elapsed_sec": 19883.260108470917, "step_time_sec": 8.229385403014021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2401, "loss": 4.744487285614014, "lr": 0.0002, "elapsed_sec": 19891.490403175354, "step_time_sec": 8.23008745801053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2402, "loss": 4.734560489654541, "lr": 0.0002, "elapsed_sec": 19899.721970319748, "step_time_sec": 8.231386928004213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2403, "loss": 4.720035076141357, "lr": 0.0002, "elapsed_sec": 19907.95307302475, "step_time_sec": 8.230928664997919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2404, "loss": 4.732629299163818, "lr": 0.0002, "elapsed_sec": 19916.182829380035, "step_time_sec": 8.229575164004928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2405, "loss": 4.696537017822266, "lr": 0.0002, "elapsed_sec": 19924.41170191765, "step_time_sec": 8.228776657982962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2406, "loss": 4.689704418182373, "lr": 0.0002, "elapsed_sec": 19932.64075088501, "step_time_sec": 8.228848931001266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2407, "loss": 4.7922773361206055, "lr": 0.0002, "elapsed_sec": 19940.871073007584, "step_time_sec": 8.230191582988482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2408, "loss": 4.7810258865356445, "lr": 0.0002, "elapsed_sec": 19949.100251674652, "step_time_sec": 8.228974969999399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2409, "loss": 4.716061592102051, "lr": 0.0002, "elapsed_sec": 19957.331266880035, "step_time_sec": 8.23091760801617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2410, "loss": 4.809371471405029, "lr": 0.0002, "elapsed_sec": 19965.562253952026, "step_time_sec": 8.230801143014105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2411, "loss": 4.666511058807373, "lr": 0.0002, "elapsed_sec": 19973.793583869934, "step_time_sec": 8.231130033003865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2412, "loss": 4.667832851409912, "lr": 0.0002, "elapsed_sec": 19982.023076295853, "step_time_sec": 8.229349025001284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2413, "loss": 4.7012763023376465, "lr": 0.0002, "elapsed_sec": 19990.251352787018, "step_time_sec": 8.228133133001393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2414, "loss": 4.592141151428223, "lr": 0.0002, "elapsed_sec": 19998.482642412186, "step_time_sec": 8.231171221996192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2415, "loss": 4.739987850189209, "lr": 0.0002, "elapsed_sec": 20006.713354825974, "step_time_sec": 8.230533947993536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2416, "loss": 4.7422380447387695, "lr": 0.0002, "elapsed_sec": 20014.944942474365, "step_time_sec": 8.23141428900999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2417, "loss": 4.823827266693115, "lr": 0.0002, "elapsed_sec": 20023.175931453705, "step_time_sec": 8.230820383003447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2418, "loss": 4.802988529205322, "lr": 0.0002, "elapsed_sec": 20031.408324480057, "step_time_sec": 8.232287022983655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2419, "loss": 4.597049236297607, "lr": 0.0002, "elapsed_sec": 20039.638504743576, "step_time_sec": 8.230053709994536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2420, "loss": 4.739407539367676, "lr": 0.0002, "elapsed_sec": 20047.868310928345, "step_time_sec": 8.229598462989088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2421, "loss": 4.758183479309082, "lr": 0.0002, "elapsed_sec": 20056.099647521973, "step_time_sec": 8.231183914002031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2422, "loss": 4.652473449707031, "lr": 0.0002, "elapsed_sec": 20064.33035993576, "step_time_sec": 8.230530091008404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2423, "loss": 4.697491645812988, "lr": 0.0002, "elapsed_sec": 20072.55889439583, "step_time_sec": 8.228384350019041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2424, "loss": 4.804382801055908, "lr": 0.0002, "elapsed_sec": 20080.787601947784, "step_time_sec": 8.22855790000176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2425, "loss": 4.598942279815674, "lr": 0.0002, "elapsed_sec": 20089.017970323563, "step_time_sec": 8.230197085998952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2426, "loss": 4.700214385986328, "lr": 0.0002, "elapsed_sec": 20097.24793934822, "step_time_sec": 8.229825631977292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2427, "loss": 4.844563961029053, "lr": 0.0002, "elapsed_sec": 20105.478910923004, "step_time_sec": 8.230827463994501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2428, "loss": 4.616761207580566, "lr": 0.0002, "elapsed_sec": 20113.71055674553, "step_time_sec": 8.231537626008503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2429, "loss": 4.730472564697266, "lr": 0.0002, "elapsed_sec": 20121.94130396843, "step_time_sec": 8.230513711983804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2430, "loss": 4.6991472244262695, "lr": 0.0002, "elapsed_sec": 20130.172922372818, "step_time_sec": 8.231470767990686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2431, "loss": 4.689359188079834, "lr": 0.0002, "elapsed_sec": 20138.403270959854, "step_time_sec": 8.230239480995806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2432, "loss": 4.750843524932861, "lr": 0.0002, "elapsed_sec": 20146.63220334053, "step_time_sec": 8.228784970007837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2433, "loss": 4.714248180389404, "lr": 0.0002, "elapsed_sec": 20154.860279798508, "step_time_sec": 8.227926065010251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2434, "loss": 4.774833679199219, "lr": 0.0002, "elapsed_sec": 20163.088545560837, "step_time_sec": 8.228058850974776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2435, "loss": 4.709483623504639, "lr": 0.0002, "elapsed_sec": 20171.320458173752, "step_time_sec": 8.231748186022742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2436, "loss": 4.653628349304199, "lr": 0.0002, "elapsed_sec": 20179.550728797913, "step_time_sec": 8.23018959100591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2437, "loss": 4.657154560089111, "lr": 0.0002, "elapsed_sec": 20197.149193763733, "step_time_sec": 17.598248241003603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2438, "loss": 4.684458255767822, "lr": 0.0002, "elapsed_sec": 20205.367732524872, "step_time_sec": 8.218399988982128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2439, "loss": 4.715842247009277, "lr": 0.0002, "elapsed_sec": 20213.596985578537, "step_time_sec": 8.229112475994043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2440, "loss": 4.757926940917969, "lr": 0.0002, "elapsed_sec": 20221.827652454376, "step_time_sec": 8.230454161006492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2441, "loss": 4.888769149780273, "lr": 0.0002, "elapsed_sec": 20230.0587849617, "step_time_sec": 8.230966567003634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2442, "loss": 4.735754489898682, "lr": 0.0002, "elapsed_sec": 20238.28967308998, "step_time_sec": 8.230747308989521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2443, "loss": 4.660858154296875, "lr": 0.0002, "elapsed_sec": 20246.52059006691, "step_time_sec": 8.230755311989924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2444, "loss": 4.71430778503418, "lr": 0.0002, "elapsed_sec": 20254.751172542572, "step_time_sec": 8.230503753002267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2445, "loss": 4.673466205596924, "lr": 0.0002, "elapsed_sec": 20262.980777978897, "step_time_sec": 8.229374032001942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2446, "loss": 4.7667694091796875, "lr": 0.0002, "elapsed_sec": 20271.209970235825, "step_time_sec": 8.229029158013873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2447, "loss": 4.513029098510742, "lr": 0.0002, "elapsed_sec": 20279.440148115158, "step_time_sec": 8.230035842978396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2448, "loss": 4.7402873039245605, "lr": 0.0002, "elapsed_sec": 20287.67178750038, "step_time_sec": 8.231466685014311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2449, "loss": 4.708887577056885, "lr": 0.0002, "elapsed_sec": 20295.90280842781, "step_time_sec": 8.230866274010623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2450, "loss": 4.673701763153076, "lr": 0.0002, "elapsed_sec": 20304.13413953781, "step_time_sec": 8.231212528015021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2451, "loss": 4.803501129150391, "lr": 0.0002, "elapsed_sec": 20312.364657640457, "step_time_sec": 8.230322084011277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2452, "loss": 4.684258460998535, "lr": 0.0002, "elapsed_sec": 20320.593879699707, "step_time_sec": 8.2290649569768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2453, "loss": 4.88509464263916, "lr": 0.0002, "elapsed_sec": 20328.825271844864, "step_time_sec": 8.231309898983454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2454, "loss": 4.617535591125488, "lr": 0.0002, "elapsed_sec": 20337.053260326385, "step_time_sec": 8.227796494000359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2455, "loss": 4.657193183898926, "lr": 0.0002, "elapsed_sec": 20345.282782316208, "step_time_sec": 8.229348329012282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2456, "loss": 4.591772556304932, "lr": 0.0002, "elapsed_sec": 20353.51095843315, "step_time_sec": 8.228093142010039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2457, "loss": 4.721952438354492, "lr": 0.0002, "elapsed_sec": 20361.741359472275, "step_time_sec": 8.230165124987252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2458, "loss": 4.7874369621276855, "lr": 0.0002, "elapsed_sec": 20369.970462799072, "step_time_sec": 8.228978620987618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2459, "loss": 4.643916130065918, "lr": 0.0002, "elapsed_sec": 20378.200571775436, "step_time_sec": 8.229920852987561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2460, "loss": 4.6734395027160645, "lr": 0.0002, "elapsed_sec": 20386.431622982025, "step_time_sec": 8.230899462010711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2461, "loss": 4.604076385498047, "lr": 0.0002, "elapsed_sec": 20394.660897016525, "step_time_sec": 8.229088000982301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2462, "loss": 4.693843364715576, "lr": 0.0002, "elapsed_sec": 20402.89305496216, "step_time_sec": 8.232022406009492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2463, "loss": 4.74310302734375, "lr": 0.0002, "elapsed_sec": 20411.121990919113, "step_time_sec": 8.228757086006226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2464, "loss": 4.692093849182129, "lr": 0.0002, "elapsed_sec": 20419.34997034073, "step_time_sec": 8.227823449997231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2465, "loss": 4.726443290710449, "lr": 0.0002, "elapsed_sec": 20427.58145427704, "step_time_sec": 8.231341652019182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2466, "loss": 4.629110813140869, "lr": 0.0002, "elapsed_sec": 20435.81245946884, "step_time_sec": 8.230811358982464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2467, "loss": 4.7203216552734375, "lr": 0.0002, "elapsed_sec": 20444.041030406952, "step_time_sec": 8.228449535992695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2468, "loss": 4.542294025421143, "lr": 0.0002, "elapsed_sec": 20452.269689798355, "step_time_sec": 8.228474782023113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2469, "loss": 5.092080593109131, "lr": 0.0002, "elapsed_sec": 20460.49797987938, "step_time_sec": 8.228148522000993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2470, "loss": 4.639532566070557, "lr": 0.0002, "elapsed_sec": 20468.728115081787, "step_time_sec": 8.230018392001512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2471, "loss": 3.8192126750946045, "lr": 0.0002, "elapsed_sec": 20476.95868206024, "step_time_sec": 8.230400906992145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2472, "loss": 4.711207866668701, "lr": 0.0002, "elapsed_sec": 20485.1871008873, "step_time_sec": 8.228282316995319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2473, "loss": 4.881582260131836, "lr": 0.0002, "elapsed_sec": 20493.41689157486, "step_time_sec": 8.229589541995665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2474, "loss": 4.726922988891602, "lr": 0.0002, "elapsed_sec": 20501.645898103714, "step_time_sec": 8.228858941001818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2475, "loss": 4.664163112640381, "lr": 0.0002, "elapsed_sec": 20509.87681531906, "step_time_sec": 8.230760912003461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2476, "loss": 4.615838050842285, "lr": 0.0002, "elapsed_sec": 20518.107586860657, "step_time_sec": 8.230622410017531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2477, "loss": 4.632724761962891, "lr": 0.0002, "elapsed_sec": 20526.336456298828, "step_time_sec": 8.228703145985492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2478, "loss": 4.845983028411865, "lr": 0.0002, "elapsed_sec": 20534.5643491745, "step_time_sec": 8.227780859015184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2479, "loss": 4.660930156707764, "lr": 0.0002, "elapsed_sec": 20542.793870449066, "step_time_sec": 8.229332228977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2480, "loss": 4.685457706451416, "lr": 0.0002, "elapsed_sec": 20551.023827791214, "step_time_sec": 8.229804943024646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2481, "loss": 4.726800441741943, "lr": 0.0002, "elapsed_sec": 20559.254749536514, "step_time_sec": 8.230739755003015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2482, "loss": 4.787102222442627, "lr": 0.0002, "elapsed_sec": 20567.483985185623, "step_time_sec": 8.229059492994566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2483, "loss": 4.611999988555908, "lr": 0.0002, "elapsed_sec": 20575.71486544609, "step_time_sec": 8.230702783010202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2484, "loss": 4.64902400970459, "lr": 0.0002, "elapsed_sec": 20583.945781946182, "step_time_sec": 8.23082992999116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2485, "loss": 4.6134843826293945, "lr": 0.0002, "elapsed_sec": 20592.17595553398, "step_time_sec": 8.22997239799588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2486, "loss": 4.655436992645264, "lr": 0.0002, "elapsed_sec": 20600.406816005707, "step_time_sec": 8.230751364986645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2487, "loss": 4.5939040184021, "lr": 0.0002, "elapsed_sec": 20608.638036727905, "step_time_sec": 8.23102865400142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2488, "loss": 4.616866588592529, "lr": 0.0002, "elapsed_sec": 20616.869569301605, "step_time_sec": 8.231412577006267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2489, "loss": 4.654376983642578, "lr": 0.0002, "elapsed_sec": 20625.098954439163, "step_time_sec": 8.229247445007786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2490, "loss": 4.7371368408203125, "lr": 0.0002, "elapsed_sec": 20633.330352544785, "step_time_sec": 8.231175502005499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2491, "loss": 4.628170967102051, "lr": 0.0002, "elapsed_sec": 20641.633157491684, "step_time_sec": 8.245976831996813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2492, "loss": 4.732179641723633, "lr": 0.0002, "elapsed_sec": 20649.86480307579, "step_time_sec": 8.231462828989606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2493, "loss": 4.812403678894043, "lr": 0.0002, "elapsed_sec": 20658.095897436142, "step_time_sec": 8.230950120982016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2494, "loss": 4.794934272766113, "lr": 0.0002, "elapsed_sec": 20666.326992988586, "step_time_sec": 8.230945390998386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2495, "loss": 4.7637834548950195, "lr": 0.0002, "elapsed_sec": 20674.558573246002, "step_time_sec": 8.23140992000117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2496, "loss": 4.667583465576172, "lr": 0.0002, "elapsed_sec": 20682.789156913757, "step_time_sec": 8.2304472370015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2497, "loss": 4.941985130310059, "lr": 0.0002, "elapsed_sec": 20691.022480249405, "step_time_sec": 8.233157134003704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2498, "loss": 4.685947895050049, "lr": 0.0002, "elapsed_sec": 20699.25353193283, "step_time_sec": 8.230913346982561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2499, "loss": 4.732772350311279, "lr": 0.0002, "elapsed_sec": 20707.484522342682, "step_time_sec": 8.230861895979615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2500, "loss": 4.678304195404053, "lr": 0.0002, "elapsed_sec": 20715.715636491776, "step_time_sec": 29.56478295198758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2501, "loss": 4.529882907867432, "lr": 0.0002, "elapsed_sec": 20745.289197206497, "step_time_sec": 8.239289996010484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2502, "loss": 4.6333489418029785, "lr": 0.0002, "elapsed_sec": 20753.505652189255, "step_time_sec": 8.21622254999238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2503, "loss": 4.634544849395752, "lr": 0.0002, "elapsed_sec": 20761.72230243683, "step_time_sec": 8.216492389008636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2504, "loss": 4.855292320251465, "lr": 0.0002, "elapsed_sec": 20769.939398765564, "step_time_sec": 8.216933222982334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2505, "loss": 4.85091495513916, "lr": 0.0002, "elapsed_sec": 20778.165883541107, "step_time_sec": 8.226337846019305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2506, "loss": 4.634377479553223, "lr": 0.0002, "elapsed_sec": 20786.394384860992, "step_time_sec": 8.228368048003176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2507, "loss": 4.752255916595459, "lr": 0.0002, "elapsed_sec": 20794.623589754105, "step_time_sec": 8.229037137003615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2508, "loss": 4.881867408752441, "lr": 0.0002, "elapsed_sec": 20802.85084271431, "step_time_sec": 8.227113029017346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2509, "loss": 4.751545429229736, "lr": 0.0002, "elapsed_sec": 20811.07976412773, "step_time_sec": 8.228771018009866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2510, "loss": 4.670376777648926, "lr": 0.0002, "elapsed_sec": 20819.311673164368, "step_time_sec": 8.23177168899565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2511, "loss": 4.762570858001709, "lr": 0.0002, "elapsed_sec": 20827.542033433914, "step_time_sec": 8.23021052198601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2512, "loss": 4.680673122406006, "lr": 0.0002, "elapsed_sec": 20835.773950576782, "step_time_sec": 8.231760391005082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2513, "loss": 4.682845115661621, "lr": 0.0002, "elapsed_sec": 20844.004600048065, "step_time_sec": 8.230456940014847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2514, "loss": 4.648585796356201, "lr": 0.0002, "elapsed_sec": 20852.235347270966, "step_time_sec": 8.230569256993476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2515, "loss": 4.675529956817627, "lr": 0.0002, "elapsed_sec": 20860.466279506683, "step_time_sec": 8.230826077982783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2516, "loss": 4.720098495483398, "lr": 0.0002, "elapsed_sec": 20868.697526454926, "step_time_sec": 8.23105074398336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2517, "loss": 4.581453800201416, "lr": 0.0002, "elapsed_sec": 20876.928188323975, "step_time_sec": 8.23052276999806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2518, "loss": 4.859967231750488, "lr": 0.0002, "elapsed_sec": 20885.158184289932, "step_time_sec": 8.229824704991188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2519, "loss": 4.693700313568115, "lr": 0.0002, "elapsed_sec": 20893.38755941391, "step_time_sec": 8.22922144801123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2520, "loss": 4.663669109344482, "lr": 0.0002, "elapsed_sec": 20901.61795425415, "step_time_sec": 8.230263237986946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2521, "loss": 4.692607402801514, "lr": 0.0002, "elapsed_sec": 20909.848479747772, "step_time_sec": 8.230365001014434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2522, "loss": 4.599253177642822, "lr": 0.0002, "elapsed_sec": 20918.079802274704, "step_time_sec": 8.231168437981978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2523, "loss": 4.613367557525635, "lr": 0.0002, "elapsed_sec": 20926.310163021088, "step_time_sec": 8.230221166013507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2524, "loss": 4.872710704803467, "lr": 0.0002, "elapsed_sec": 20934.54141306877, "step_time_sec": 8.231053918978432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2525, "loss": 4.560860633850098, "lr": 0.0002, "elapsed_sec": 20942.77239060402, "step_time_sec": 8.230860865005525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2526, "loss": 4.586683750152588, "lr": 0.0002, "elapsed_sec": 20951.001487255096, "step_time_sec": 8.228911531972699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2527, "loss": 4.596395969390869, "lr": 0.0002, "elapsed_sec": 20959.231386899948, "step_time_sec": 8.229800197994336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2528, "loss": 4.784695148468018, "lr": 0.0002, "elapsed_sec": 20967.461206674576, "step_time_sec": 8.229635907016927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2529, "loss": 4.692692756652832, "lr": 0.0002, "elapsed_sec": 20975.691868305206, "step_time_sec": 8.230544836027548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2530, "loss": 4.678190231323242, "lr": 0.0002, "elapsed_sec": 20983.92183971405, "step_time_sec": 8.229798738990212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2531, "loss": 4.745083808898926, "lr": 0.0002, "elapsed_sec": 20992.15326499939, "step_time_sec": 8.231251616991358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2532, "loss": 4.77344274520874, "lr": 0.0002, "elapsed_sec": 21000.383539915085, "step_time_sec": 8.230131123011233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2533, "loss": 4.6612372398376465, "lr": 0.0002, "elapsed_sec": 21008.61308979988, "step_time_sec": 8.229394005989889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2534, "loss": 4.726983547210693, "lr": 0.0002, "elapsed_sec": 21016.84197282791, "step_time_sec": 8.228766462998465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2535, "loss": 4.661275863647461, "lr": 0.0002, "elapsed_sec": 21025.069757938385, "step_time_sec": 8.227564870991046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2536, "loss": 4.969485282897949, "lr": 0.0002, "elapsed_sec": 21033.30073404312, "step_time_sec": 8.23082838600385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2537, "loss": 4.7004313468933105, "lr": 0.0002, "elapsed_sec": 21041.53147625923, "step_time_sec": 8.230652814003406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2538, "loss": 4.860958099365234, "lr": 0.0002, "elapsed_sec": 21049.760486602783, "step_time_sec": 8.228791660018032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2539, "loss": 4.647279739379883, "lr": 0.0002, "elapsed_sec": 21057.987874269485, "step_time_sec": 8.227246846014168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2540, "loss": 4.726660251617432, "lr": 0.0002, "elapsed_sec": 21066.217535972595, "step_time_sec": 8.229534566984512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2541, "loss": 4.7429656982421875, "lr": 0.0002, "elapsed_sec": 21074.44710111618, "step_time_sec": 8.229386392980814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2542, "loss": 4.688119411468506, "lr": 0.0002, "elapsed_sec": 21082.677938699722, "step_time_sec": 8.230674659018405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2543, "loss": 4.984662055969238, "lr": 0.0002, "elapsed_sec": 21090.907948732376, "step_time_sec": 8.229895677010063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2544, "loss": 4.724640369415283, "lr": 0.0002, "elapsed_sec": 21099.137050390244, "step_time_sec": 8.228938290994847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2545, "loss": 4.680286884307861, "lr": 0.0002, "elapsed_sec": 21107.367220163345, "step_time_sec": 8.229923309001606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2546, "loss": 4.692224979400635, "lr": 0.0002, "elapsed_sec": 21115.598438739777, "step_time_sec": 8.231058079982176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2547, "loss": 4.601898193359375, "lr": 0.0002, "elapsed_sec": 21123.829372406006, "step_time_sec": 8.230785166000715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2548, "loss": 4.61082124710083, "lr": 0.0002, "elapsed_sec": 21132.061073303223, "step_time_sec": 8.231546805007383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2549, "loss": 4.630067348480225, "lr": 0.0002, "elapsed_sec": 21140.290061712265, "step_time_sec": 8.228827438986627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2550, "loss": 4.72283935546875, "lr": 0.0002, "elapsed_sec": 21148.518138170242, "step_time_sec": 8.227918151998892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2551, "loss": 4.7213029861450195, "lr": 0.0002, "elapsed_sec": 21156.74951314926, "step_time_sec": 8.231282987020677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2552, "loss": 4.695431232452393, "lr": 0.0002, "elapsed_sec": 21164.980233430862, "step_time_sec": 8.230516851996072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2553, "loss": 4.700077056884766, "lr": 0.0002, "elapsed_sec": 21173.2114238739, "step_time_sec": 8.231036868994124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2554, "loss": 4.718552112579346, "lr": 0.0002, "elapsed_sec": 21181.442183494568, "step_time_sec": 8.230589091021102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2555, "loss": 4.9917893409729, "lr": 0.0002, "elapsed_sec": 21189.672630786896, "step_time_sec": 8.230307944992092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2556, "loss": 4.77323055267334, "lr": 0.0002, "elapsed_sec": 21197.902119636536, "step_time_sec": 8.22932318499079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2557, "loss": 4.623552322387695, "lr": 0.0002, "elapsed_sec": 21206.130598306656, "step_time_sec": 8.228347035998013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2558, "loss": 4.5437703132629395, "lr": 0.0002, "elapsed_sec": 21214.359493017197, "step_time_sec": 8.228739555983339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2559, "loss": 4.661116600036621, "lr": 0.0002, "elapsed_sec": 21222.590391874313, "step_time_sec": 8.230713122000452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2560, "loss": 4.68331241607666, "lr": 0.0002, "elapsed_sec": 21230.822286844254, "step_time_sec": 8.231811570993159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2561, "loss": 4.580930233001709, "lr": 0.0002, "elapsed_sec": 21239.052270889282, "step_time_sec": 8.229764448013157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2562, "loss": 4.780625820159912, "lr": 0.0002, "elapsed_sec": 21247.282091140747, "step_time_sec": 8.229659889009781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2563, "loss": 4.663039684295654, "lr": 0.0002, "elapsed_sec": 21255.51168036461, "step_time_sec": 8.229498628992587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2564, "loss": 4.697241306304932, "lr": 0.0002, "elapsed_sec": 21263.742024183273, "step_time_sec": 8.230107517010765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2565, "loss": 4.580911636352539, "lr": 0.0002, "elapsed_sec": 21271.972855329514, "step_time_sec": 8.230674991995329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2566, "loss": 4.645091533660889, "lr": 0.0002, "elapsed_sec": 21280.200042963028, "step_time_sec": 8.22704165900359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2567, "loss": 4.609678745269775, "lr": 0.0002, "elapsed_sec": 21288.429914712906, "step_time_sec": 8.229746905999491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2568, "loss": 4.68795919418335, "lr": 0.0002, "elapsed_sec": 21296.658187627792, "step_time_sec": 8.228119866020279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2569, "loss": 4.664326190948486, "lr": 0.0002, "elapsed_sec": 21304.88682293892, "step_time_sec": 8.228423520020442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2570, "loss": 4.736649036407471, "lr": 0.0002, "elapsed_sec": 21313.117413520813, "step_time_sec": 8.230464149004547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2571, "loss": 4.700524806976318, "lr": 0.0002, "elapsed_sec": 21321.34795641899, "step_time_sec": 8.230430506984703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2572, "loss": 4.730429649353027, "lr": 0.0002, "elapsed_sec": 21329.578140735626, "step_time_sec": 8.230021225987002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2573, "loss": 4.662782669067383, "lr": 0.0002, "elapsed_sec": 21337.806821346283, "step_time_sec": 8.228484117978951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2574, "loss": 4.724783897399902, "lr": 0.0002, "elapsed_sec": 21346.036673069, "step_time_sec": 8.229752109997207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2575, "loss": 4.606544494628906, "lr": 0.0002, "elapsed_sec": 21354.26834344864, "step_time_sec": 8.231522027985193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2576, "loss": 4.770145416259766, "lr": 0.0002, "elapsed_sec": 21362.498296022415, "step_time_sec": 8.229757984983735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2577, "loss": 4.719069004058838, "lr": 0.0002, "elapsed_sec": 21370.72967028618, "step_time_sec": 8.231192091014236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2578, "loss": 4.668900966644287, "lr": 0.0002, "elapsed_sec": 21378.960899591446, "step_time_sec": 8.231067891989369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2579, "loss": 4.811873912811279, "lr": 0.0002, "elapsed_sec": 21387.191952466965, "step_time_sec": 8.23094075798872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2580, "loss": 4.625147342681885, "lr": 0.0002, "elapsed_sec": 21395.422879695892, "step_time_sec": 8.2307176690083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2581, "loss": 4.600000381469727, "lr": 0.0002, "elapsed_sec": 21403.653621912003, "step_time_sec": 8.230639718007296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2582, "loss": 4.678141117095947, "lr": 0.0002, "elapsed_sec": 21411.884307861328, "step_time_sec": 8.23051370898611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2583, "loss": 4.755406379699707, "lr": 0.0002, "elapsed_sec": 21420.115113973618, "step_time_sec": 8.230615784006659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2584, "loss": 4.654372215270996, "lr": 0.0002, "elapsed_sec": 21428.347312927246, "step_time_sec": 8.23207908301265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2585, "loss": 4.598785400390625, "lr": 0.0002, "elapsed_sec": 21436.577912330627, "step_time_sec": 8.230423575005261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2586, "loss": 4.7328314781188965, "lr": 0.0002, "elapsed_sec": 21444.808506011963, "step_time_sec": 8.230402584013063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2587, "loss": 4.717775821685791, "lr": 0.0002, "elapsed_sec": 21453.036736011505, "step_time_sec": 8.228074655984528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2588, "loss": 4.811224937438965, "lr": 0.0002, "elapsed_sec": 21461.26575231552, "step_time_sec": 8.22890412400011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2589, "loss": 4.664462566375732, "lr": 0.0002, "elapsed_sec": 21469.493919610977, "step_time_sec": 8.228040716989199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2590, "loss": 4.726855754852295, "lr": 0.0002, "elapsed_sec": 21477.72396683693, "step_time_sec": 8.229794162005419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2591, "loss": 4.654155254364014, "lr": 0.0002, "elapsed_sec": 21485.952689170837, "step_time_sec": 8.228567427984672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2592, "loss": 4.756533145904541, "lr": 0.0002, "elapsed_sec": 21494.184183597565, "step_time_sec": 8.231366010004422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2593, "loss": 4.6141533851623535, "lr": 0.0002, "elapsed_sec": 21502.4148542881, "step_time_sec": 8.230507291998947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2594, "loss": 4.5691609382629395, "lr": 0.0002, "elapsed_sec": 21510.645686864853, "step_time_sec": 8.230696446989896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2595, "loss": 4.4918646812438965, "lr": 0.0002, "elapsed_sec": 21518.875328302383, "step_time_sec": 8.229472976992838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2596, "loss": 4.675387859344482, "lr": 0.0002, "elapsed_sec": 21527.10465669632, "step_time_sec": 8.229215247003594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2597, "loss": 4.660126686096191, "lr": 0.0002, "elapsed_sec": 21535.33562898636, "step_time_sec": 8.230772846989566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2598, "loss": 4.700488567352295, "lr": 0.0002, "elapsed_sec": 21543.566388130188, "step_time_sec": 8.23060859600082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2599, "loss": 4.596510410308838, "lr": 0.0002, "elapsed_sec": 21551.79771900177, "step_time_sec": 8.231151699001202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2600, "loss": 4.6179094314575195, "lr": 0.0002, "elapsed_sec": 21560.026916503906, "step_time_sec": 8.22904623800423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2601, "loss": 4.627079963684082, "lr": 0.0002, "elapsed_sec": 21568.255653619766, "step_time_sec": 8.228619853995042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2602, "loss": 4.606199741363525, "lr": 0.0002, "elapsed_sec": 21576.484599113464, "step_time_sec": 8.228806505998364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2603, "loss": 4.655127048492432, "lr": 0.0002, "elapsed_sec": 21584.71322965622, "step_time_sec": 8.2284740509931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2604, "loss": 4.612159252166748, "lr": 0.0002, "elapsed_sec": 21592.943743944168, "step_time_sec": 8.230330780002987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2605, "loss": 4.61944055557251, "lr": 0.0002, "elapsed_sec": 21601.17413210869, "step_time_sec": 8.230214926996268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2606, "loss": 4.637219429016113, "lr": 0.0002, "elapsed_sec": 21609.405188798904, "step_time_sec": 8.230969264986925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2607, "loss": 4.688307762145996, "lr": 0.0002, "elapsed_sec": 21617.634783267975, "step_time_sec": 8.229405500984285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2608, "loss": 4.747183322906494, "lr": 0.0002, "elapsed_sec": 21625.864212036133, "step_time_sec": 8.229230243014172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2609, "loss": 4.6356072425842285, "lr": 0.0002, "elapsed_sec": 21634.09403538704, "step_time_sec": 8.229685903002974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2610, "loss": 4.626055717468262, "lr": 0.0002, "elapsed_sec": 21642.323616981506, "step_time_sec": 8.229411905980669, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2611, "loss": 4.82187032699585, "lr": 0.0002, "elapsed_sec": 21650.55196928978, "step_time_sec": 8.228207475011004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2612, "loss": 4.712846755981445, "lr": 0.0002, "elapsed_sec": 21658.780943870544, "step_time_sec": 8.228819612006191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2613, "loss": 4.71159029006958, "lr": 0.0002, "elapsed_sec": 21667.011424303055, "step_time_sec": 8.230325411015656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2614, "loss": 4.729896545410156, "lr": 0.0002, "elapsed_sec": 21675.24093270302, "step_time_sec": 8.229424638004275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2615, "loss": 4.594387531280518, "lr": 0.0002, "elapsed_sec": 21683.470500946045, "step_time_sec": 8.229337098018732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2616, "loss": 4.730456829071045, "lr": 0.0002, "elapsed_sec": 21691.70119547844, "step_time_sec": 8.230580956995254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2617, "loss": 4.605047702789307, "lr": 0.0002, "elapsed_sec": 21699.92798614502, "step_time_sec": 8.226594245992601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2618, "loss": 4.530078411102295, "lr": 0.0002, "elapsed_sec": 21708.159139871597, "step_time_sec": 8.231036832003156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2619, "loss": 4.708760738372803, "lr": 0.0002, "elapsed_sec": 21716.38753938675, "step_time_sec": 8.228186811000342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2620, "loss": 4.567300319671631, "lr": 0.0002, "elapsed_sec": 21724.618438243866, "step_time_sec": 8.230796738993376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2621, "loss": 4.691983222961426, "lr": 0.0002, "elapsed_sec": 21732.851629257202, "step_time_sec": 8.233003400993766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2622, "loss": 4.602262496948242, "lr": 0.0002, "elapsed_sec": 21741.08264732361, "step_time_sec": 8.230850774009014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2623, "loss": 4.105885028839111, "lr": 0.0002, "elapsed_sec": 21749.310264587402, "step_time_sec": 8.227479627996217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2624, "loss": 4.662384510040283, "lr": 0.0002, "elapsed_sec": 21757.538960695267, "step_time_sec": 8.228543530014576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2625, "loss": 4.814426898956299, "lr": 0.0002, "elapsed_sec": 21765.76645231247, "step_time_sec": 8.227320559002692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2626, "loss": 4.652174472808838, "lr": 0.0002, "elapsed_sec": 21773.995478630066, "step_time_sec": 8.228864799981238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2627, "loss": 4.620194911956787, "lr": 0.0002, "elapsed_sec": 21782.225911140442, "step_time_sec": 8.23028338100994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2628, "loss": 4.727962017059326, "lr": 0.0002, "elapsed_sec": 21790.457227230072, "step_time_sec": 8.231172186002368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2629, "loss": 4.6507248878479, "lr": 0.0002, "elapsed_sec": 21798.687941789627, "step_time_sec": 8.230580905015813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2630, "loss": 4.432463645935059, "lr": 0.0002, "elapsed_sec": 21806.918046474457, "step_time_sec": 8.22998206198099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2631, "loss": 4.596065521240234, "lr": 0.0002, "elapsed_sec": 21815.14936065674, "step_time_sec": 8.231118603987852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2632, "loss": 4.688173294067383, "lr": 0.0002, "elapsed_sec": 21823.380496501923, "step_time_sec": 8.231021151994355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2633, "loss": 4.720455169677734, "lr": 0.0002, "elapsed_sec": 21831.610727787018, "step_time_sec": 8.230039059009869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2634, "loss": 4.730831623077393, "lr": 0.0002, "elapsed_sec": 21839.840185165405, "step_time_sec": 8.229309163987637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2635, "loss": 4.5597310066223145, "lr": 0.0002, "elapsed_sec": 21848.07200694084, "step_time_sec": 8.231674218986882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2636, "loss": 4.620671272277832, "lr": 0.0002, "elapsed_sec": 21856.302100419998, "step_time_sec": 8.229874265001854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2637, "loss": 4.601119518280029, "lr": 0.0002, "elapsed_sec": 21864.529428958893, "step_time_sec": 8.227172814978985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2638, "loss": 4.638670444488525, "lr": 0.0002, "elapsed_sec": 21872.75823187828, "step_time_sec": 8.228673259000061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2639, "loss": 4.776505947113037, "lr": 0.0002, "elapsed_sec": 21880.987654447556, "step_time_sec": 8.229284304979956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2640, "loss": 4.669657230377197, "lr": 0.0002, "elapsed_sec": 21889.218505859375, "step_time_sec": 8.230688818002818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2641, "loss": 4.614442825317383, "lr": 0.0002, "elapsed_sec": 21897.448743581772, "step_time_sec": 8.230105946975527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2642, "loss": 5.027438163757324, "lr": 0.0002, "elapsed_sec": 21905.679966926575, "step_time_sec": 8.23102176899556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2643, "loss": 4.55946683883667, "lr": 0.0002, "elapsed_sec": 21913.910585165024, "step_time_sec": 8.230533855006797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2644, "loss": 4.617366790771484, "lr": 0.0002, "elapsed_sec": 21922.141796827316, "step_time_sec": 8.231021045008674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2645, "loss": 4.748910903930664, "lr": 0.0002, "elapsed_sec": 21930.373465538025, "step_time_sec": 8.231478471017908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2646, "loss": 4.621723175048828, "lr": 0.0002, "elapsed_sec": 21938.60302376747, "step_time_sec": 8.229419718001736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2647, "loss": 4.6339311599731445, "lr": 0.0002, "elapsed_sec": 21946.83211040497, "step_time_sec": 8.228972821001662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2648, "loss": 4.823899269104004, "lr": 0.0002, "elapsed_sec": 21955.063004732132, "step_time_sec": 8.230722245993093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2649, "loss": 4.6365742683410645, "lr": 0.0002, "elapsed_sec": 21963.294288635254, "step_time_sec": 8.231083535996731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2650, "loss": 4.629298210144043, "lr": 0.0002, "elapsed_sec": 21971.52578997612, "step_time_sec": 8.231357545009814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2651, "loss": 4.610926628112793, "lr": 0.0002, "elapsed_sec": 21979.756067037582, "step_time_sec": 8.23016295497655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2652, "loss": 4.531740188598633, "lr": 0.0002, "elapsed_sec": 21987.986432790756, "step_time_sec": 8.230195466981968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2653, "loss": 4.730116367340088, "lr": 0.0002, "elapsed_sec": 21996.213286161423, "step_time_sec": 8.226662955014035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2654, "loss": 4.682662010192871, "lr": 0.0002, "elapsed_sec": 22004.442907094955, "step_time_sec": 8.22946572600631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2655, "loss": 4.617262363433838, "lr": 0.0002, "elapsed_sec": 22012.67364692688, "step_time_sec": 8.230594810011098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2656, "loss": 4.713841438293457, "lr": 0.0002, "elapsed_sec": 22020.90214252472, "step_time_sec": 8.228359651984647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2657, "loss": 4.699833869934082, "lr": 0.0002, "elapsed_sec": 22029.131516218185, "step_time_sec": 8.229232117009815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2658, "loss": 4.648962020874023, "lr": 0.0002, "elapsed_sec": 22037.359743833542, "step_time_sec": 8.228029024001444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2659, "loss": 4.700395107269287, "lr": 0.0002, "elapsed_sec": 22045.588891983032, "step_time_sec": 8.229009518021485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2660, "loss": 4.621579647064209, "lr": 0.0002, "elapsed_sec": 22053.81971859932, "step_time_sec": 8.230651213991223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2661, "loss": 4.663940906524658, "lr": 0.0002, "elapsed_sec": 22062.05061531067, "step_time_sec": 8.230793202994391, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2662, "loss": 4.670229434967041, "lr": 0.0002, "elapsed_sec": 22070.281925201416, "step_time_sec": 8.231112028995994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2663, "loss": 4.608488082885742, "lr": 0.0002, "elapsed_sec": 22078.51246213913, "step_time_sec": 8.23045036700205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2664, "loss": 4.862466812133789, "lr": 0.0002, "elapsed_sec": 22086.74206995964, "step_time_sec": 8.229436446999898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2665, "loss": 4.475545406341553, "lr": 0.0002, "elapsed_sec": 22094.97203731537, "step_time_sec": 8.229806744027883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2666, "loss": 4.537258625030518, "lr": 0.0002, "elapsed_sec": 22103.202634572983, "step_time_sec": 8.230376065999735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2667, "loss": 4.528110027313232, "lr": 0.0002, "elapsed_sec": 22111.433309078217, "step_time_sec": 8.23053003899986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2668, "loss": 4.468289375305176, "lr": 0.0002, "elapsed_sec": 22119.664470672607, "step_time_sec": 8.23100247498951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2669, "loss": 4.529934406280518, "lr": 0.0002, "elapsed_sec": 22127.89569067955, "step_time_sec": 8.231050709000556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2670, "loss": 4.586321830749512, "lr": 0.0002, "elapsed_sec": 22136.12693309784, "step_time_sec": 8.231079449004028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2671, "loss": 4.7221455574035645, "lr": 0.0002, "elapsed_sec": 22144.358286619186, "step_time_sec": 8.231241011992097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2672, "loss": 4.636501312255859, "lr": 0.0002, "elapsed_sec": 22152.58907699585, "step_time_sec": 8.2306703949871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2673, "loss": 4.676424503326416, "lr": 0.0002, "elapsed_sec": 22160.821268320084, "step_time_sec": 8.231971061002696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2674, "loss": 4.640921115875244, "lr": 0.0002, "elapsed_sec": 22169.05210161209, "step_time_sec": 8.230720972002018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2675, "loss": 4.537862777709961, "lr": 0.0002, "elapsed_sec": 22177.283730745316, "step_time_sec": 8.231463278993033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2676, "loss": 4.728183746337891, "lr": 0.0002, "elapsed_sec": 22185.515308618546, "step_time_sec": 8.231409508007346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2677, "loss": 4.682427883148193, "lr": 0.0002, "elapsed_sec": 22193.743991613388, "step_time_sec": 8.228488564986037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2678, "loss": 4.564118385314941, "lr": 0.0002, "elapsed_sec": 22201.973299264908, "step_time_sec": 8.22916000999976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2679, "loss": 4.67188835144043, "lr": 0.0002, "elapsed_sec": 22210.202088356018, "step_time_sec": 8.228680308995536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2680, "loss": 4.745395183563232, "lr": 0.0002, "elapsed_sec": 22218.432214021683, "step_time_sec": 8.229926369996974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2681, "loss": 4.568253517150879, "lr": 0.0002, "elapsed_sec": 22226.659366369247, "step_time_sec": 8.227006299013738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2682, "loss": 4.719239234924316, "lr": 0.0002, "elapsed_sec": 22234.88881111145, "step_time_sec": 8.229306112014456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2683, "loss": 4.768899440765381, "lr": 0.0002, "elapsed_sec": 22243.117968082428, "step_time_sec": 8.228985008987365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2684, "loss": 4.6497392654418945, "lr": 0.0002, "elapsed_sec": 22251.348204135895, "step_time_sec": 8.2300802259997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2685, "loss": 4.601049423217773, "lr": 0.0002, "elapsed_sec": 22259.57985472679, "step_time_sec": 8.23149225799716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2686, "loss": 4.662336826324463, "lr": 0.0002, "elapsed_sec": 22267.810270547867, "step_time_sec": 8.230251064000186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2687, "loss": 4.661563873291016, "lr": 0.0002, "elapsed_sec": 22276.040052890778, "step_time_sec": 8.229618148994632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2688, "loss": 4.704708099365234, "lr": 0.0002, "elapsed_sec": 22284.269206762314, "step_time_sec": 8.229000630002702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2689, "loss": 4.624760627746582, "lr": 0.0002, "elapsed_sec": 22292.49984049797, "step_time_sec": 8.230514085997129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2690, "loss": 4.659230709075928, "lr": 0.0002, "elapsed_sec": 22300.731439828873, "step_time_sec": 8.231395736016566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2691, "loss": 4.579570770263672, "lr": 0.0002, "elapsed_sec": 22308.9640750885, "step_time_sec": 8.23252560099354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2692, "loss": 4.575830459594727, "lr": 0.0002, "elapsed_sec": 22317.19514107704, "step_time_sec": 8.23094874198432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2693, "loss": 4.772308826446533, "lr": 0.0002, "elapsed_sec": 22325.42601275444, "step_time_sec": 8.230650345009053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2694, "loss": 4.521655559539795, "lr": 0.0002, "elapsed_sec": 22333.65572309494, "step_time_sec": 8.229595709010027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2695, "loss": 4.577195644378662, "lr": 0.0002, "elapsed_sec": 22341.886331796646, "step_time_sec": 8.230357348016696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2696, "loss": 4.587441921234131, "lr": 0.0002, "elapsed_sec": 22350.11513710022, "step_time_sec": 8.228698488994269, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2697, "loss": 4.653411388397217, "lr": 0.0002, "elapsed_sec": 22358.346219539642, "step_time_sec": 8.230949259013869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2698, "loss": 4.542043209075928, "lr": 0.0002, "elapsed_sec": 22366.57622385025, "step_time_sec": 8.22984699299559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2699, "loss": 4.666797637939453, "lr": 0.0002, "elapsed_sec": 22374.805875062943, "step_time_sec": 8.229444370022975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2700, "loss": 4.6574931144714355, "lr": 0.0002, "elapsed_sec": 22383.03650689125, "step_time_sec": 8.230499681987567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2701, "loss": 4.714599609375, "lr": 0.0002, "elapsed_sec": 22391.26753807068, "step_time_sec": 8.230883497017203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2702, "loss": 4.760054588317871, "lr": 0.0002, "elapsed_sec": 22399.498386621475, "step_time_sec": 8.230730701994617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2703, "loss": 4.634001731872559, "lr": 0.0002, "elapsed_sec": 22407.72903084755, "step_time_sec": 8.230483571009245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2704, "loss": 4.68281888961792, "lr": 0.0002, "elapsed_sec": 22415.960400104523, "step_time_sec": 8.231153041997459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2705, "loss": 4.5756449699401855, "lr": 0.0002, "elapsed_sec": 22424.188725471497, "step_time_sec": 8.228180656005861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2706, "loss": 4.600033760070801, "lr": 0.0002, "elapsed_sec": 22432.41911816597, "step_time_sec": 8.23028230201453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2707, "loss": 4.426373481750488, "lr": 0.0002, "elapsed_sec": 22440.647121667862, "step_time_sec": 8.227803204994416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2708, "loss": 4.495386600494385, "lr": 0.0002, "elapsed_sec": 22448.875755786896, "step_time_sec": 8.228456873010146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2709, "loss": 4.586201190948486, "lr": 0.0002, "elapsed_sec": 22457.10613489151, "step_time_sec": 8.230217428994365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2710, "loss": 4.66021728515625, "lr": 0.0002, "elapsed_sec": 22465.333903074265, "step_time_sec": 8.227666538004996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2711, "loss": 4.675652503967285, "lr": 0.0002, "elapsed_sec": 22473.56125998497, "step_time_sec": 8.227162590017542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2712, "loss": 4.6048383712768555, "lr": 0.0002, "elapsed_sec": 22481.790625810623, "step_time_sec": 8.229214805993252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2713, "loss": 4.569057464599609, "lr": 0.0002, "elapsed_sec": 22490.01989722252, "step_time_sec": 8.22908527700929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2714, "loss": 4.6980509757995605, "lr": 0.0002, "elapsed_sec": 22498.248014211655, "step_time_sec": 8.228060820983956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2715, "loss": 4.616293907165527, "lr": 0.0002, "elapsed_sec": 22506.479645490646, "step_time_sec": 8.231402955017984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2716, "loss": 4.6272873878479, "lr": 0.0002, "elapsed_sec": 22514.70993757248, "step_time_sec": 8.230138343002182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2717, "loss": 4.746583461761475, "lr": 0.0002, "elapsed_sec": 22522.94185948372, "step_time_sec": 8.231760855996981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2718, "loss": 4.742059707641602, "lr": 0.0002, "elapsed_sec": 22531.173167467117, "step_time_sec": 8.231139278999763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2719, "loss": 4.663022518157959, "lr": 0.0002, "elapsed_sec": 22539.404759645462, "step_time_sec": 8.231469752994599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2720, "loss": 4.566410064697266, "lr": 0.0002, "elapsed_sec": 22547.635110378265, "step_time_sec": 8.230213011993328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2721, "loss": 4.567693710327148, "lr": 0.0002, "elapsed_sec": 22555.865895986557, "step_time_sec": 8.23057611900731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2722, "loss": 4.72245979309082, "lr": 0.0002, "elapsed_sec": 22564.0970389843, "step_time_sec": 8.230963395006256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2723, "loss": 4.5562052726745605, "lr": 0.0002, "elapsed_sec": 22572.326749801636, "step_time_sec": 8.229540286993142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2724, "loss": 4.58851957321167, "lr": 0.0002, "elapsed_sec": 22580.556458234787, "step_time_sec": 8.229595710989088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2725, "loss": 4.566022872924805, "lr": 0.0002, "elapsed_sec": 22588.786353349686, "step_time_sec": 8.229705163015751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2726, "loss": 4.526535987854004, "lr": 0.0002, "elapsed_sec": 22597.01707148552, "step_time_sec": 8.230592027015518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2727, "loss": 4.607754707336426, "lr": 0.0002, "elapsed_sec": 22605.247096776962, "step_time_sec": 8.22993532998953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2728, "loss": 4.585367679595947, "lr": 0.0002, "elapsed_sec": 22613.478293180466, "step_time_sec": 8.230954679020215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2729, "loss": 4.537492275238037, "lr": 0.0002, "elapsed_sec": 22621.70946955681, "step_time_sec": 8.231039817008423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2730, "loss": 4.54507303237915, "lr": 0.0002, "elapsed_sec": 22629.940396785736, "step_time_sec": 8.230813193978975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2731, "loss": 4.628140926361084, "lr": 0.0002, "elapsed_sec": 22638.16990661621, "step_time_sec": 8.22930150499451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2732, "loss": 4.62401008605957, "lr": 0.0002, "elapsed_sec": 22646.39813041687, "step_time_sec": 8.2281063060218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2733, "loss": 4.565721035003662, "lr": 0.0002, "elapsed_sec": 22654.629601955414, "step_time_sec": 8.231282067979919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2734, "loss": 4.553305625915527, "lr": 0.0002, "elapsed_sec": 22662.860395669937, "step_time_sec": 8.230693980993237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2735, "loss": 4.651305198669434, "lr": 0.0002, "elapsed_sec": 22671.09261417389, "step_time_sec": 8.232004258985398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2736, "loss": 4.481616973876953, "lr": 0.0002, "elapsed_sec": 22679.32215833664, "step_time_sec": 8.22941508499207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2737, "loss": 4.650973796844482, "lr": 0.0002, "elapsed_sec": 22687.551506519318, "step_time_sec": 8.229148782993434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2738, "loss": 4.630519866943359, "lr": 0.0002, "elapsed_sec": 22695.77924132347, "step_time_sec": 8.227648656000383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2739, "loss": 4.645529747009277, "lr": 0.0002, "elapsed_sec": 22704.010151147842, "step_time_sec": 8.230663477996131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2740, "loss": 4.587497711181641, "lr": 0.0002, "elapsed_sec": 22712.239881277084, "step_time_sec": 8.229539584979648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2741, "loss": 4.597281455993652, "lr": 0.0002, "elapsed_sec": 22720.47097802162, "step_time_sec": 8.23099958000239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2742, "loss": 4.526557922363281, "lr": 0.0002, "elapsed_sec": 22728.701763868332, "step_time_sec": 8.230563163990155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2743, "loss": 4.494027137756348, "lr": 0.0002, "elapsed_sec": 22736.93272638321, "step_time_sec": 8.230802019999828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2744, "loss": 4.708230018615723, "lr": 0.0002, "elapsed_sec": 22745.16198706627, "step_time_sec": 8.229119285999332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2745, "loss": 4.820937633514404, "lr": 0.0002, "elapsed_sec": 22753.3918569088, "step_time_sec": 8.229719322000165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2746, "loss": 4.584656715393066, "lr": 0.0002, "elapsed_sec": 22761.621297597885, "step_time_sec": 8.229302506020758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2747, "loss": 4.530609607696533, "lr": 0.0002, "elapsed_sec": 22769.850813627243, "step_time_sec": 8.229360102006467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2748, "loss": 4.644809722900391, "lr": 0.0002, "elapsed_sec": 22778.081036806107, "step_time_sec": 8.230106172006344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2749, "loss": 4.611154556274414, "lr": 0.0002, "elapsed_sec": 22786.31198811531, "step_time_sec": 8.230730706010945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2750, "loss": 4.5878801345825195, "lr": 0.0002, "elapsed_sec": 22794.543261051178, "step_time_sec": 8.23114194200025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2751, "loss": 4.565774440765381, "lr": 0.0002, "elapsed_sec": 22802.771726608276, "step_time_sec": 8.228353829996195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2752, "loss": 4.651994228363037, "lr": 0.0002, "elapsed_sec": 22811.00104880333, "step_time_sec": 8.229123325989349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2753, "loss": 4.2689528465271, "lr": 0.0002, "elapsed_sec": 22819.22880911827, "step_time_sec": 8.227612875023624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2754, "loss": 4.577883243560791, "lr": 0.0002, "elapsed_sec": 22827.45783996582, "step_time_sec": 8.22887072301819, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2755, "loss": 4.537928104400635, "lr": 0.0002, "elapsed_sec": 22835.68708872795, "step_time_sec": 8.229146820987808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2756, "loss": 4.6617279052734375, "lr": 0.0002, "elapsed_sec": 22843.914247989655, "step_time_sec": 8.226988306007115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2757, "loss": 4.626819610595703, "lr": 0.0002, "elapsed_sec": 22852.143983364105, "step_time_sec": 8.229547010996612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2758, "loss": 4.596529006958008, "lr": 0.0002, "elapsed_sec": 22860.371616125107, "step_time_sec": 8.227558275975753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2759, "loss": 4.610722541809082, "lr": 0.0002, "elapsed_sec": 22868.60131430626, "step_time_sec": 8.229473955987487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2760, "loss": 4.6033830642700195, "lr": 0.0002, "elapsed_sec": 22876.832721948624, "step_time_sec": 8.23128318900126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2761, "loss": 4.534388542175293, "lr": 0.0002, "elapsed_sec": 22885.06226682663, "step_time_sec": 8.229346453998005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2762, "loss": 4.614955425262451, "lr": 0.0002, "elapsed_sec": 22893.29149246216, "step_time_sec": 8.229122479009675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2763, "loss": 4.594147682189941, "lr": 0.0002, "elapsed_sec": 22901.51940560341, "step_time_sec": 8.227713250991656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2764, "loss": 4.590221405029297, "lr": 0.0002, "elapsed_sec": 22909.75002837181, "step_time_sec": 8.230492806993425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2765, "loss": 4.5770673751831055, "lr": 0.0002, "elapsed_sec": 22917.98116183281, "step_time_sec": 8.231016656005522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2766, "loss": 4.4424333572387695, "lr": 0.0002, "elapsed_sec": 22926.211072444916, "step_time_sec": 8.229699240997434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2767, "loss": 4.556941032409668, "lr": 0.0002, "elapsed_sec": 22934.440272569656, "step_time_sec": 8.229044979991158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2768, "loss": 4.5262298583984375, "lr": 0.0002, "elapsed_sec": 22942.670517921448, "step_time_sec": 8.230095712002367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2769, "loss": 4.609014511108398, "lr": 0.0002, "elapsed_sec": 22950.90019083023, "step_time_sec": 8.229607590998057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2770, "loss": 4.542576313018799, "lr": 0.0002, "elapsed_sec": 22959.131595373154, "step_time_sec": 8.231157584988978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2771, "loss": 4.568789958953857, "lr": 0.0002, "elapsed_sec": 22967.361619472504, "step_time_sec": 8.229879670980154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2772, "loss": 4.5131964683532715, "lr": 0.0002, "elapsed_sec": 22975.59406709671, "step_time_sec": 8.23231912299525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2773, "loss": 4.5905985832214355, "lr": 0.0002, "elapsed_sec": 22983.824645280838, "step_time_sec": 8.230400558997644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2774, "loss": 4.462674140930176, "lr": 0.0002, "elapsed_sec": 22992.053997039795, "step_time_sec": 8.229198603978148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2775, "loss": 4.526019096374512, "lr": 0.0002, "elapsed_sec": 23000.284041404724, "step_time_sec": 8.229882340994664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2776, "loss": 4.545140743255615, "lr": 0.0002, "elapsed_sec": 23008.511626958847, "step_time_sec": 8.227409081009682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2777, "loss": 4.539638996124268, "lr": 0.0002, "elapsed_sec": 23016.743133068085, "step_time_sec": 8.231408417021157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2778, "loss": 4.536755561828613, "lr": 0.0002, "elapsed_sec": 23024.973731040955, "step_time_sec": 8.230390192009509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2779, "loss": 4.5244879722595215, "lr": 0.0002, "elapsed_sec": 23033.20347046852, "step_time_sec": 8.229639366007177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2780, "loss": 4.557704925537109, "lr": 0.0002, "elapsed_sec": 23041.432372808456, "step_time_sec": 8.228713771997718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2781, "loss": 4.620666027069092, "lr": 0.0002, "elapsed_sec": 23049.659618377686, "step_time_sec": 8.227094080997631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2782, "loss": 4.615739822387695, "lr": 0.0002, "elapsed_sec": 23057.89077615738, "step_time_sec": 8.230980292020831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2783, "loss": 4.542320728302002, "lr": 0.0002, "elapsed_sec": 23066.121305704117, "step_time_sec": 8.230389383010333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2784, "loss": 4.754216194152832, "lr": 0.0002, "elapsed_sec": 23074.352030277252, "step_time_sec": 8.230552297987742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2785, "loss": 4.634716033935547, "lr": 0.0002, "elapsed_sec": 23082.583786010742, "step_time_sec": 8.231583668995881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2786, "loss": 4.551044464111328, "lr": 0.0002, "elapsed_sec": 23090.814831256866, "step_time_sec": 8.230933073995402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2787, "loss": 4.519670486450195, "lr": 0.0002, "elapsed_sec": 23099.04557275772, "step_time_sec": 8.230556786002126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2788, "loss": 4.603142261505127, "lr": 0.0002, "elapsed_sec": 23107.275986909866, "step_time_sec": 8.230282194999745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2789, "loss": 4.625524044036865, "lr": 0.0002, "elapsed_sec": 23115.50769495964, "step_time_sec": 8.2315861590032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2790, "loss": 4.544651031494141, "lr": 0.0002, "elapsed_sec": 23123.737258911133, "step_time_sec": 8.229343790007988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2791, "loss": 4.577003002166748, "lr": 0.0002, "elapsed_sec": 23131.964269161224, "step_time_sec": 8.226858138979878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2792, "loss": 4.607483386993408, "lr": 0.0002, "elapsed_sec": 23140.192984104156, "step_time_sec": 8.22858389900648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2793, "loss": 4.455385684967041, "lr": 0.0002, "elapsed_sec": 23148.42183470726, "step_time_sec": 8.22872415999882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2794, "loss": 4.445532321929932, "lr": 0.0002, "elapsed_sec": 23156.65061068535, "step_time_sec": 8.228552339976886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2795, "loss": 4.499051094055176, "lr": 0.0002, "elapsed_sec": 23164.879005670547, "step_time_sec": 8.228245034988504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2796, "loss": 4.603705883026123, "lr": 0.0002, "elapsed_sec": 23173.10753273964, "step_time_sec": 8.228365804010537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2797, "loss": 4.421053409576416, "lr": 0.0002, "elapsed_sec": 23181.337941884995, "step_time_sec": 8.230299400980584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2798, "loss": 4.67800760269165, "lr": 0.0002, "elapsed_sec": 23189.566338300705, "step_time_sec": 8.228230476990575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2799, "loss": 4.667261600494385, "lr": 0.0002, "elapsed_sec": 23197.794907331467, "step_time_sec": 8.228394392994232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2800, "loss": 4.560338020324707, "lr": 0.0002, "elapsed_sec": 23206.023777723312, "step_time_sec": 8.228743352985475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2801, "loss": 4.411718845367432, "lr": 0.0002, "elapsed_sec": 23214.253712892532, "step_time_sec": 8.229767571989214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2802, "loss": 4.666287422180176, "lr": 0.0002, "elapsed_sec": 23222.484487771988, "step_time_sec": 8.230603588977829, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2803, "loss": 4.536012649536133, "lr": 0.0002, "elapsed_sec": 23230.715762376785, "step_time_sec": 8.231161309988238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2804, "loss": 4.443243980407715, "lr": 0.0002, "elapsed_sec": 23238.947377204895, "step_time_sec": 8.231421805976424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2805, "loss": 4.564094543457031, "lr": 0.0002, "elapsed_sec": 23247.175755023956, "step_time_sec": 8.228220499993768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2806, "loss": 4.591754913330078, "lr": 0.0002, "elapsed_sec": 23255.405359506607, "step_time_sec": 8.2294823040138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2807, "loss": 4.532959938049316, "lr": 0.0002, "elapsed_sec": 23263.636507749557, "step_time_sec": 8.231015319994185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2808, "loss": 4.6142072677612305, "lr": 0.0002, "elapsed_sec": 23271.867735147476, "step_time_sec": 8.231030914990697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2809, "loss": 4.52388334274292, "lr": 0.0002, "elapsed_sec": 23280.095814943314, "step_time_sec": 8.227934480004478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2810, "loss": 4.573061466217041, "lr": 0.0002, "elapsed_sec": 23288.327017068863, "step_time_sec": 8.23109426497831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2811, "loss": 4.6632537841796875, "lr": 0.0002, "elapsed_sec": 23296.558127641678, "step_time_sec": 8.230938550987048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2812, "loss": 4.391626834869385, "lr": 0.0002, "elapsed_sec": 23304.786519289017, "step_time_sec": 8.228157828998519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2813, "loss": 4.531961917877197, "lr": 0.0002, "elapsed_sec": 23313.01510834694, "step_time_sec": 8.228431372990599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2814, "loss": 4.542135715484619, "lr": 0.0002, "elapsed_sec": 23321.244753599167, "step_time_sec": 8.229482935013948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2815, "loss": 4.56690788269043, "lr": 0.0002, "elapsed_sec": 23329.476157426834, "step_time_sec": 8.23129461298231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2816, "loss": 4.61724328994751, "lr": 0.0002, "elapsed_sec": 23337.707100391388, "step_time_sec": 8.230777296004817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2817, "loss": 4.756894588470459, "lr": 0.0002, "elapsed_sec": 23345.93688225746, "step_time_sec": 8.229625456006033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2818, "loss": 4.697519779205322, "lr": 0.0002, "elapsed_sec": 23354.167471170425, "step_time_sec": 8.2304098700115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2819, "loss": 4.545988082885742, "lr": 0.0002, "elapsed_sec": 23362.399802684784, "step_time_sec": 8.232163883018075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2820, "loss": 4.613286018371582, "lr": 0.0002, "elapsed_sec": 23370.62910413742, "step_time_sec": 8.229144863027614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2821, "loss": 4.6024699211120605, "lr": 0.0002, "elapsed_sec": 23378.858599185944, "step_time_sec": 8.229393546003848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2822, "loss": 4.492286205291748, "lr": 0.0002, "elapsed_sec": 23387.089374780655, "step_time_sec": 8.23059391300194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2823, "loss": 4.551214218139648, "lr": 0.0002, "elapsed_sec": 23395.320779561996, "step_time_sec": 8.23123516299529, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2824, "loss": 4.584118843078613, "lr": 0.0002, "elapsed_sec": 23403.55215215683, "step_time_sec": 8.231280859996332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2825, "loss": 4.594841957092285, "lr": 0.0002, "elapsed_sec": 23411.783669233322, "step_time_sec": 8.231334059004439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2826, "loss": 4.650684833526611, "lr": 0.0002, "elapsed_sec": 23420.014234781265, "step_time_sec": 8.230382242996711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2827, "loss": 4.767763137817383, "lr": 0.0002, "elapsed_sec": 23428.245049238205, "step_time_sec": 8.230677371000638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2828, "loss": 4.568088531494141, "lr": 0.0002, "elapsed_sec": 23436.47475528717, "step_time_sec": 8.22951716199168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2829, "loss": 4.516456604003906, "lr": 0.0002, "elapsed_sec": 23444.703402757645, "step_time_sec": 8.228477012977237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2830, "loss": 4.607192516326904, "lr": 0.0002, "elapsed_sec": 23452.93395423889, "step_time_sec": 8.230418683000607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2831, "loss": 4.476310729980469, "lr": 0.0002, "elapsed_sec": 23461.16426038742, "step_time_sec": 8.230181418010034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2832, "loss": 4.744218349456787, "lr": 0.0002, "elapsed_sec": 23469.395594596863, "step_time_sec": 8.23115376001806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2833, "loss": 4.627787113189697, "lr": 0.0002, "elapsed_sec": 23477.627315044403, "step_time_sec": 8.231631673988886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2834, "loss": 4.500881671905518, "lr": 0.0002, "elapsed_sec": 23485.857669115067, "step_time_sec": 8.230166867986554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2835, "loss": 4.584688186645508, "lr": 0.0002, "elapsed_sec": 23494.08914041519, "step_time_sec": 8.231295200996101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2836, "loss": 4.552865982055664, "lr": 0.0002, "elapsed_sec": 23502.317957639694, "step_time_sec": 8.228617717977613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2837, "loss": 4.561269283294678, "lr": 0.0002, "elapsed_sec": 23510.548583984375, "step_time_sec": 8.230486814019969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2838, "loss": 4.775350570678711, "lr": 0.0002, "elapsed_sec": 23518.77964782715, "step_time_sec": 8.230956696002977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2839, "loss": 4.603885173797607, "lr": 0.0002, "elapsed_sec": 23527.010868549347, "step_time_sec": 8.231103992002318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2840, "loss": 4.518984794616699, "lr": 0.0002, "elapsed_sec": 23535.24244594574, "step_time_sec": 8.231410857988521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2841, "loss": 4.585295677185059, "lr": 0.0002, "elapsed_sec": 23543.471435070038, "step_time_sec": 8.228760433994466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2842, "loss": 4.694502830505371, "lr": 0.0002, "elapsed_sec": 23551.69986796379, "step_time_sec": 8.228303790994687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2843, "loss": 4.454806804656982, "lr": 0.0002, "elapsed_sec": 23559.93069791794, "step_time_sec": 8.23069331899751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2844, "loss": 4.6807122230529785, "lr": 0.0002, "elapsed_sec": 23568.162239789963, "step_time_sec": 8.231399830983719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2845, "loss": 4.607193470001221, "lr": 0.0002, "elapsed_sec": 23576.392911672592, "step_time_sec": 8.230442111002048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2846, "loss": 4.637869834899902, "lr": 0.0002, "elapsed_sec": 23584.62362408638, "step_time_sec": 8.230608694982948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2847, "loss": 4.527827739715576, "lr": 0.0002, "elapsed_sec": 23592.85561275482, "step_time_sec": 8.231785649986705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2848, "loss": 4.615720748901367, "lr": 0.0002, "elapsed_sec": 23601.086864233017, "step_time_sec": 8.231122230004985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2849, "loss": 4.564467430114746, "lr": 0.0002, "elapsed_sec": 23609.318739652634, "step_time_sec": 8.231683220015839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2850, "loss": 4.698380470275879, "lr": 0.0002, "elapsed_sec": 23617.549333572388, "step_time_sec": 8.230445004999638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2851, "loss": 4.449108600616455, "lr": 0.0002, "elapsed_sec": 23625.776481628418, "step_time_sec": 8.226971258001868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2852, "loss": 4.6440629959106445, "lr": 0.0002, "elapsed_sec": 23634.006304740906, "step_time_sec": 8.22971964199678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2853, "loss": 4.625702381134033, "lr": 0.0002, "elapsed_sec": 23642.234894752502, "step_time_sec": 8.228428233007435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2854, "loss": 4.557787895202637, "lr": 0.0002, "elapsed_sec": 23650.464257001877, "step_time_sec": 8.229164215998026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2855, "loss": 4.585596561431885, "lr": 0.0002, "elapsed_sec": 23658.693416833878, "step_time_sec": 8.229019963007886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2856, "loss": 4.4100751876831055, "lr": 0.0002, "elapsed_sec": 23666.922672748566, "step_time_sec": 8.229082263016608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2857, "loss": 4.388934135437012, "lr": 0.0002, "elapsed_sec": 23675.15346121788, "step_time_sec": 8.230639152985532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2858, "loss": 4.627986431121826, "lr": 0.0002, "elapsed_sec": 23683.383664369583, "step_time_sec": 8.230039571004454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2859, "loss": 4.388398170471191, "lr": 0.0002, "elapsed_sec": 23691.615531682968, "step_time_sec": 8.231728423997993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2860, "loss": 4.549843788146973, "lr": 0.0002, "elapsed_sec": 23699.845804929733, "step_time_sec": 8.230125197995221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2861, "loss": 4.558018207550049, "lr": 0.0002, "elapsed_sec": 23708.074990034103, "step_time_sec": 8.229039764002664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2862, "loss": 4.636157512664795, "lr": 0.0002, "elapsed_sec": 23716.304932117462, "step_time_sec": 8.229821641987655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2863, "loss": 4.614828109741211, "lr": 0.0002, "elapsed_sec": 23724.534724235535, "step_time_sec": 8.229559892992256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2864, "loss": 4.496252536773682, "lr": 0.0002, "elapsed_sec": 23732.764159679413, "step_time_sec": 8.229316601005848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2865, "loss": 4.443150043487549, "lr": 0.0002, "elapsed_sec": 23740.993718624115, "step_time_sec": 8.229422003001673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2866, "loss": 4.581871509552002, "lr": 0.0002, "elapsed_sec": 23749.224478006363, "step_time_sec": 8.230578987015178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2867, "loss": 4.497636318206787, "lr": 0.0002, "elapsed_sec": 23757.455327749252, "step_time_sec": 8.230634320992976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2868, "loss": 4.445837497711182, "lr": 0.0002, "elapsed_sec": 23765.68600344658, "step_time_sec": 8.230527084000641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2869, "loss": 4.573306560516357, "lr": 0.0002, "elapsed_sec": 23773.917078971863, "step_time_sec": 8.230979659012519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2870, "loss": 4.632876873016357, "lr": 0.0002, "elapsed_sec": 23782.148518562317, "step_time_sec": 8.231211036996683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2871, "loss": 4.596323013305664, "lr": 0.0002, "elapsed_sec": 23790.379576921463, "step_time_sec": 8.230917053995654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2872, "loss": 4.5751142501831055, "lr": 0.0002, "elapsed_sec": 23798.610686540604, "step_time_sec": 8.230954754020786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2873, "loss": 4.557948112487793, "lr": 0.0002, "elapsed_sec": 23806.841304063797, "step_time_sec": 8.230510847002733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2874, "loss": 4.537570953369141, "lr": 0.0002, "elapsed_sec": 23815.071314573288, "step_time_sec": 8.229825230984716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2875, "loss": 4.510697841644287, "lr": 0.0002, "elapsed_sec": 23823.301827430725, "step_time_sec": 8.230367648007814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2876, "loss": 4.578820705413818, "lr": 0.0002, "elapsed_sec": 23831.532945871353, "step_time_sec": 8.230959587992402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2877, "loss": 4.598762512207031, "lr": 0.0002, "elapsed_sec": 23839.76378273964, "step_time_sec": 8.230665115988813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2878, "loss": 4.478279113769531, "lr": 0.0002, "elapsed_sec": 23847.995282649994, "step_time_sec": 8.231353071983904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2879, "loss": 4.593464374542236, "lr": 0.0002, "elapsed_sec": 23856.22524523735, "step_time_sec": 8.229831565986387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2880, "loss": 4.504091262817383, "lr": 0.0002, "elapsed_sec": 23864.454406499863, "step_time_sec": 8.228948517004028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2881, "loss": 4.573498249053955, "lr": 0.0002, "elapsed_sec": 23872.684482336044, "step_time_sec": 8.229930850997334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2882, "loss": 4.557616710662842, "lr": 0.0002, "elapsed_sec": 23880.91395664215, "step_time_sec": 8.229317751014605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2883, "loss": 4.6382927894592285, "lr": 0.0002, "elapsed_sec": 23889.143023490906, "step_time_sec": 8.228911327983951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2884, "loss": 4.603132247924805, "lr": 0.0002, "elapsed_sec": 23897.37098622322, "step_time_sec": 8.22775081099826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2885, "loss": 4.583941459655762, "lr": 0.0002, "elapsed_sec": 23905.60195040703, "step_time_sec": 8.230784813000355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2886, "loss": 4.503550052642822, "lr": 0.0002, "elapsed_sec": 23913.830531835556, "step_time_sec": 8.228483977989526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2887, "loss": 4.598721981048584, "lr": 0.0002, "elapsed_sec": 23922.058312892914, "step_time_sec": 8.227596290991642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2888, "loss": 4.492114067077637, "lr": 0.0002, "elapsed_sec": 23930.290065288544, "step_time_sec": 8.231584472989198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2889, "loss": 4.530465126037598, "lr": 0.0002, "elapsed_sec": 23938.51909017563, "step_time_sec": 8.22891829299624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2890, "loss": 4.39451265335083, "lr": 0.0002, "elapsed_sec": 23946.75144982338, "step_time_sec": 8.232116224011406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2891, "loss": 4.56776762008667, "lr": 0.0002, "elapsed_sec": 23954.981977939606, "step_time_sec": 8.23045606497908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2892, "loss": 4.477401256561279, "lr": 0.0002, "elapsed_sec": 23963.213097810745, "step_time_sec": 8.230932339007268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2893, "loss": 4.609582901000977, "lr": 0.0002, "elapsed_sec": 23971.444159030914, "step_time_sec": 8.230907428980572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2894, "loss": 4.639607906341553, "lr": 0.0002, "elapsed_sec": 23979.675417900085, "step_time_sec": 8.23106378200464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2895, "loss": 4.4552788734436035, "lr": 0.0002, "elapsed_sec": 23987.90641117096, "step_time_sec": 8.230818841984728, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2896, "loss": 4.617063522338867, "lr": 0.0002, "elapsed_sec": 23996.1374065876, "step_time_sec": 8.230857934977394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2897, "loss": 4.578759670257568, "lr": 0.0002, "elapsed_sec": 24004.368327140808, "step_time_sec": 8.230808915977832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2898, "loss": 4.4786882400512695, "lr": 0.0002, "elapsed_sec": 24012.597633123398, "step_time_sec": 8.229102165991208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2899, "loss": 4.577725887298584, "lr": 0.0002, "elapsed_sec": 24020.82681941986, "step_time_sec": 8.22908357900451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2900, "loss": 4.79613733291626, "lr": 0.0002, "elapsed_sec": 24029.054409742355, "step_time_sec": 8.227380277006887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2901, "loss": 4.551348686218262, "lr": 0.0002, "elapsed_sec": 24037.283765792847, "step_time_sec": 8.229201573005412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2902, "loss": 4.589111804962158, "lr": 0.0002, "elapsed_sec": 24045.51246905327, "step_time_sec": 8.22858371498296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2903, "loss": 4.619271755218506, "lr": 0.0002, "elapsed_sec": 24053.74188184738, "step_time_sec": 8.229235231003258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2904, "loss": 4.600138187408447, "lr": 0.0002, "elapsed_sec": 24061.970329284668, "step_time_sec": 8.228314271982526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2905, "loss": 4.484142780303955, "lr": 0.0002, "elapsed_sec": 24070.201372146606, "step_time_sec": 8.230874667002354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2906, "loss": 4.686042308807373, "lr": 0.0002, "elapsed_sec": 24078.432332992554, "step_time_sec": 8.23081296600867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2907, "loss": 4.52003812789917, "lr": 0.0002, "elapsed_sec": 24086.664191246033, "step_time_sec": 8.231736193993129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2908, "loss": 4.632213115692139, "lr": 0.0002, "elapsed_sec": 24094.89343237877, "step_time_sec": 8.229092580004362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2909, "loss": 4.536559581756592, "lr": 0.0002, "elapsed_sec": 24103.12208867073, "step_time_sec": 8.228465332009364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2910, "loss": 4.60026741027832, "lr": 0.0002, "elapsed_sec": 24111.35350584984, "step_time_sec": 8.231288908980787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2911, "loss": 4.4648518562316895, "lr": 0.0002, "elapsed_sec": 24119.584859609604, "step_time_sec": 8.231204331008485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2912, "loss": 4.600060939788818, "lr": 0.0002, "elapsed_sec": 24127.81619119644, "step_time_sec": 8.231126771017443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2913, "loss": 4.683010578155518, "lr": 0.0002, "elapsed_sec": 24136.04420399666, "step_time_sec": 8.227869023976382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2914, "loss": 4.398467540740967, "lr": 0.0002, "elapsed_sec": 24144.274448871613, "step_time_sec": 8.230078847002005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2915, "loss": 4.569817066192627, "lr": 0.0002, "elapsed_sec": 24152.50511240959, "step_time_sec": 8.230516300012823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2916, "loss": 4.500409126281738, "lr": 0.0002, "elapsed_sec": 24160.73561525345, "step_time_sec": 8.230334013991524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2917, "loss": 4.536463737487793, "lr": 0.0002, "elapsed_sec": 24168.96661090851, "step_time_sec": 8.230854939989513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2918, "loss": 4.594810962677002, "lr": 0.0002, "elapsed_sec": 24177.19713306427, "step_time_sec": 8.230370368983131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2919, "loss": 4.592286586761475, "lr": 0.0002, "elapsed_sec": 24185.42800807953, "step_time_sec": 8.230763638974167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2920, "loss": 4.51588249206543, "lr": 0.0002, "elapsed_sec": 24193.65859055519, "step_time_sec": 8.230428505979944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2921, "loss": 4.521244049072266, "lr": 0.0002, "elapsed_sec": 24201.889891147614, "step_time_sec": 8.231072400987614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2922, "loss": 4.582005500793457, "lr": 0.0002, "elapsed_sec": 24210.120111703873, "step_time_sec": 8.230080584995449, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2923, "loss": 4.589579105377197, "lr": 0.0002, "elapsed_sec": 24218.34897184372, "step_time_sec": 8.228714332013624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2924, "loss": 4.607024192810059, "lr": 0.0002, "elapsed_sec": 24226.57753586769, "step_time_sec": 8.228462783008581, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2925, "loss": 4.613606929779053, "lr": 0.0002, "elapsed_sec": 24234.807153701782, "step_time_sec": 8.229440577008063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2926, "loss": 4.578705787658691, "lr": 0.0002, "elapsed_sec": 24243.038192749023, "step_time_sec": 8.230850249994546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2927, "loss": 4.475072860717773, "lr": 0.0002, "elapsed_sec": 24251.26857471466, "step_time_sec": 8.230244894017233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2928, "loss": 4.526270389556885, "lr": 0.0002, "elapsed_sec": 24259.499925374985, "step_time_sec": 8.231251072022133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2929, "loss": 4.478902816772461, "lr": 0.0002, "elapsed_sec": 24267.73005247116, "step_time_sec": 8.229897633980727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2930, "loss": 4.61025857925415, "lr": 0.0002, "elapsed_sec": 24275.959160089493, "step_time_sec": 8.228935575985815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2931, "loss": 4.561093330383301, "lr": 0.0002, "elapsed_sec": 24284.189979076385, "step_time_sec": 8.230689642979996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2932, "loss": 4.5231218338012695, "lr": 0.0002, "elapsed_sec": 24292.421955108643, "step_time_sec": 8.231774368003244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2933, "loss": 4.510174751281738, "lr": 0.0002, "elapsed_sec": 24300.653133630753, "step_time_sec": 8.231061113998294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2934, "loss": 4.520911693572998, "lr": 0.0002, "elapsed_sec": 24308.882813692093, "step_time_sec": 8.229489577002823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2935, "loss": 4.59043025970459, "lr": 0.0002, "elapsed_sec": 24317.112307548523, "step_time_sec": 8.229345437983284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2936, "loss": 4.603836536407471, "lr": 0.0002, "elapsed_sec": 24325.338960647583, "step_time_sec": 8.226499448006507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2937, "loss": 4.463178634643555, "lr": 0.0002, "elapsed_sec": 24333.5664999485, "step_time_sec": 8.227385840000352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2938, "loss": 4.486845970153809, "lr": 0.0002, "elapsed_sec": 24341.796008825302, "step_time_sec": 8.229389653017279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2939, "loss": 4.5013747215271, "lr": 0.0002, "elapsed_sec": 24350.027554273605, "step_time_sec": 8.231346023996593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2940, "loss": 4.501907825469971, "lr": 0.0002, "elapsed_sec": 24358.258303642273, "step_time_sec": 8.23066910597845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2941, "loss": 4.362703800201416, "lr": 0.0002, "elapsed_sec": 24366.487366199493, "step_time_sec": 8.228883211006178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2942, "loss": 4.498821258544922, "lr": 0.0002, "elapsed_sec": 24374.71521115303, "step_time_sec": 8.227676670998335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2943, "loss": 4.65888786315918, "lr": 0.0002, "elapsed_sec": 24382.946315526962, "step_time_sec": 8.230915874999482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2944, "loss": 4.614913463592529, "lr": 0.0002, "elapsed_sec": 24391.176568984985, "step_time_sec": 8.230128614988644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2945, "loss": 4.470675468444824, "lr": 0.0002, "elapsed_sec": 24399.40762734413, "step_time_sec": 8.230907835997641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2946, "loss": 4.651138782501221, "lr": 0.0002, "elapsed_sec": 24407.638184070587, "step_time_sec": 8.230387543007964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2947, "loss": 4.586792945861816, "lr": 0.0002, "elapsed_sec": 24415.86936020851, "step_time_sec": 8.231008881994057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2948, "loss": 4.398551940917969, "lr": 0.0002, "elapsed_sec": 24424.100815296173, "step_time_sec": 8.23129536799388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2949, "loss": 4.478698253631592, "lr": 0.0002, "elapsed_sec": 24432.329055070877, "step_time_sec": 8.228103180998005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2950, "loss": 4.676365852355957, "lr": 0.0002, "elapsed_sec": 24440.559324026108, "step_time_sec": 8.230104212998413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2951, "loss": 4.578207492828369, "lr": 0.0002, "elapsed_sec": 24448.79017472267, "step_time_sec": 8.230723465996562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2952, "loss": 4.562243938446045, "lr": 0.0002, "elapsed_sec": 24457.02085661888, "step_time_sec": 8.230541153025115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2953, "loss": 4.526615619659424, "lr": 0.0002, "elapsed_sec": 24465.25212073326, "step_time_sec": 8.231100617995253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2954, "loss": 4.6023268699646, "lr": 0.0002, "elapsed_sec": 24473.483867883682, "step_time_sec": 8.231568437011447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2955, "loss": 4.53168249130249, "lr": 0.0002, "elapsed_sec": 24481.7146627903, "step_time_sec": 8.230681615998037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2956, "loss": 4.5562005043029785, "lr": 0.0002, "elapsed_sec": 24489.94615650177, "step_time_sec": 8.231353504001163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2957, "loss": 4.4491753578186035, "lr": 0.0002, "elapsed_sec": 24498.17546105385, "step_time_sec": 8.229086136998376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2958, "loss": 4.472263336181641, "lr": 0.0002, "elapsed_sec": 24506.404908657074, "step_time_sec": 8.229306242981693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2959, "loss": 4.479744911193848, "lr": 0.0002, "elapsed_sec": 24514.633224964142, "step_time_sec": 8.22821219699108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2960, "loss": 4.763888359069824, "lr": 0.0002, "elapsed_sec": 24522.862054109573, "step_time_sec": 8.228617307991954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2961, "loss": 4.4713616371154785, "lr": 0.0002, "elapsed_sec": 24531.09281396866, "step_time_sec": 8.230620149988681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2962, "loss": 4.7205352783203125, "lr": 0.0002, "elapsed_sec": 24539.323005199432, "step_time_sec": 8.230051674006972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2963, "loss": 4.528275966644287, "lr": 0.0002, "elapsed_sec": 24547.55278325081, "step_time_sec": 8.229591589974007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2964, "loss": 4.462243556976318, "lr": 0.0002, "elapsed_sec": 24555.781356096268, "step_time_sec": 8.228425605018856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2965, "loss": 4.579093933105469, "lr": 0.0002, "elapsed_sec": 24564.010279655457, "step_time_sec": 8.228758701996412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2966, "loss": 4.641961574554443, "lr": 0.0002, "elapsed_sec": 24572.240550518036, "step_time_sec": 8.23012774399831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2967, "loss": 4.5006585121154785, "lr": 0.0002, "elapsed_sec": 24580.47088766098, "step_time_sec": 8.230178541998612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2968, "loss": 4.563503742218018, "lr": 0.0002, "elapsed_sec": 24588.700278520584, "step_time_sec": 8.229259244020795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2969, "loss": 4.536167621612549, "lr": 0.0002, "elapsed_sec": 24596.931715011597, "step_time_sec": 8.231301541003631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2970, "loss": 4.502870082855225, "lr": 0.0002, "elapsed_sec": 24605.160922527313, "step_time_sec": 8.229051897011232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2971, "loss": 4.411904335021973, "lr": 0.0002, "elapsed_sec": 24613.390996456146, "step_time_sec": 8.229888342990307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2972, "loss": 4.455585479736328, "lr": 0.0002, "elapsed_sec": 24621.622328042984, "step_time_sec": 8.231187375000445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2973, "loss": 4.59849739074707, "lr": 0.0002, "elapsed_sec": 24629.852995872498, "step_time_sec": 8.23049101801007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2974, "loss": 4.517385005950928, "lr": 0.0002, "elapsed_sec": 24638.08105778694, "step_time_sec": 8.227890901005594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2975, "loss": 4.585744857788086, "lr": 0.0002, "elapsed_sec": 24646.3094894886, "step_time_sec": 8.228329679986928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2976, "loss": 4.602401256561279, "lr": 0.0002, "elapsed_sec": 24654.540392398834, "step_time_sec": 8.230691920995014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2977, "loss": 4.393040180206299, "lr": 0.0002, "elapsed_sec": 24662.77148962021, "step_time_sec": 8.230936520994874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2978, "loss": 4.649991989135742, "lr": 0.0002, "elapsed_sec": 24671.001732587814, "step_time_sec": 8.230123353016097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2979, "loss": 4.569704532623291, "lr": 0.0002, "elapsed_sec": 24679.23281621933, "step_time_sec": 8.230908897996414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2980, "loss": 4.540932655334473, "lr": 0.0002, "elapsed_sec": 24687.462364435196, "step_time_sec": 8.229402203985956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2981, "loss": 4.6173930168151855, "lr": 0.0002, "elapsed_sec": 24695.691501140594, "step_time_sec": 8.228975016012555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2982, "loss": 4.663516044616699, "lr": 0.0002, "elapsed_sec": 24703.92102575302, "step_time_sec": 8.229381448996719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2983, "loss": 4.542163848876953, "lr": 0.0002, "elapsed_sec": 24712.15006017685, "step_time_sec": 8.228871533996426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2984, "loss": 4.481139183044434, "lr": 0.0002, "elapsed_sec": 24720.380165815353, "step_time_sec": 8.229987452010391, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2985, "loss": 4.657299041748047, "lr": 0.0002, "elapsed_sec": 24728.60825586319, "step_time_sec": 8.227880648017162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2986, "loss": 4.56674861907959, "lr": 0.0002, "elapsed_sec": 24736.83899283409, "step_time_sec": 8.230624608986545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2987, "loss": 4.522998332977295, "lr": 0.0002, "elapsed_sec": 24745.069856643677, "step_time_sec": 8.23071748300572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2988, "loss": 4.505543231964111, "lr": 0.0002, "elapsed_sec": 24753.300433397293, "step_time_sec": 8.230363044014666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2989, "loss": 4.558279991149902, "lr": 0.0002, "elapsed_sec": 24761.53294491768, "step_time_sec": 8.23235699898214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2990, "loss": 4.456969261169434, "lr": 0.0002, "elapsed_sec": 24769.76242375374, "step_time_sec": 8.229352700000163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2991, "loss": 4.373049736022949, "lr": 0.0002, "elapsed_sec": 24777.992280960083, "step_time_sec": 8.22968130200752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2992, "loss": 4.527470588684082, "lr": 0.0002, "elapsed_sec": 24786.223271608353, "step_time_sec": 8.230899043992395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2993, "loss": 4.451897621154785, "lr": 0.0002, "elapsed_sec": 24794.454938411713, "step_time_sec": 8.231417511007749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2994, "loss": 4.530401229858398, "lr": 0.0002, "elapsed_sec": 24802.685157060623, "step_time_sec": 8.230060861009406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2995, "loss": 4.510833263397217, "lr": 0.0002, "elapsed_sec": 24810.91616821289, "step_time_sec": 8.230875765002565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2996, "loss": 4.707082271575928, "lr": 0.0002, "elapsed_sec": 24819.14773464203, "step_time_sec": 8.231383001984796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2997, "loss": 4.606380462646484, "lr": 0.0002, "elapsed_sec": 24827.378623008728, "step_time_sec": 8.230737814999884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2998, "loss": 4.3550567626953125, "lr": 0.0002, "elapsed_sec": 24835.607790470123, "step_time_sec": 8.229052048001904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 2999, "loss": 4.561082363128662, "lr": 0.0002, "elapsed_sec": 24843.836261749268, "step_time_sec": 8.228278614988085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3000, "loss": 4.466728210449219, "lr": 0.0002, "elapsed_sec": 24852.06564259529, "step_time_sec": 51.39885063399561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9834307570126839, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3001, "loss": 4.470818519592285, "lr": 0.0002, "elapsed_sec": 24903.476132631302, "step_time_sec": 8.240767625014996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3002, "loss": 4.485359191894531, "lr": 0.0002, "elapsed_sec": 24911.705514907837, "step_time_sec": 8.229210847988725, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3003, "loss": 4.4992475509643555, "lr": 0.0002, "elapsed_sec": 24919.936400175095, "step_time_sec": 8.230734940996626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3004, "loss": 4.471918106079102, "lr": 0.0002, "elapsed_sec": 24928.167743206024, "step_time_sec": 8.231189145997632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3005, "loss": 4.562265396118164, "lr": 0.0002, "elapsed_sec": 24936.398186683655, "step_time_sec": 8.230279210984008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3006, "loss": 4.520921230316162, "lr": 0.0002, "elapsed_sec": 24944.629564762115, "step_time_sec": 8.231229066994274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3007, "loss": 4.565518379211426, "lr": 0.0002, "elapsed_sec": 24952.86073088646, "step_time_sec": 8.230988467985298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3008, "loss": 4.504579544067383, "lr": 0.0002, "elapsed_sec": 24961.091388702393, "step_time_sec": 8.230528109997977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3009, "loss": 4.5583343505859375, "lr": 0.0002, "elapsed_sec": 24969.322280406952, "step_time_sec": 8.230766883993056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3010, "loss": 4.566969394683838, "lr": 0.0002, "elapsed_sec": 24977.553594350815, "step_time_sec": 8.231089414999587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3011, "loss": 4.443416118621826, "lr": 0.0002, "elapsed_sec": 24985.784532785416, "step_time_sec": 8.230826941988198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3012, "loss": 4.309601306915283, "lr": 0.0002, "elapsed_sec": 24994.015322446823, "step_time_sec": 8.230606831988553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3013, "loss": 4.45321798324585, "lr": 0.0002, "elapsed_sec": 25002.246587753296, "step_time_sec": 8.231145280995406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3014, "loss": 4.579485893249512, "lr": 0.0002, "elapsed_sec": 25010.477969646454, "step_time_sec": 8.231203923991416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3015, "loss": 4.425179481506348, "lr": 0.0002, "elapsed_sec": 25018.709466695786, "step_time_sec": 8.231292036012746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3016, "loss": 4.4177703857421875, "lr": 0.0002, "elapsed_sec": 25026.939260721207, "step_time_sec": 8.229626499000005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3017, "loss": 4.629584312438965, "lr": 0.0002, "elapsed_sec": 25035.17011642456, "step_time_sec": 8.230763622006634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3018, "loss": 4.498134136199951, "lr": 0.0002, "elapsed_sec": 25043.40179848671, "step_time_sec": 8.23144436700386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3019, "loss": 4.451782703399658, "lr": 0.0002, "elapsed_sec": 25051.631393432617, "step_time_sec": 8.229445794015191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3020, "loss": 4.572837829589844, "lr": 0.0002, "elapsed_sec": 25059.859826803207, "step_time_sec": 8.228274149005301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3021, "loss": 4.566891193389893, "lr": 0.0002, "elapsed_sec": 25068.09068250656, "step_time_sec": 8.23072730898275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3022, "loss": 4.502648830413818, "lr": 0.0002, "elapsed_sec": 25076.321719646454, "step_time_sec": 8.230820859025698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3023, "loss": 4.412215232849121, "lr": 0.0002, "elapsed_sec": 25084.553097963333, "step_time_sec": 8.231231721991207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3024, "loss": 4.4787821769714355, "lr": 0.0002, "elapsed_sec": 25092.784078598022, "step_time_sec": 8.230824664991815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3025, "loss": 4.545645236968994, "lr": 0.0002, "elapsed_sec": 25101.01505637169, "step_time_sec": 8.230863463017158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3026, "loss": 4.699020862579346, "lr": 0.0002, "elapsed_sec": 25109.246095895767, "step_time_sec": 8.230808303982485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3027, "loss": 4.457080841064453, "lr": 0.0002, "elapsed_sec": 25117.476003408432, "step_time_sec": 8.229747008008417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3028, "loss": 4.412954807281494, "lr": 0.0002, "elapsed_sec": 25125.704279899597, "step_time_sec": 8.228152811992913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3029, "loss": 4.550955772399902, "lr": 0.0002, "elapsed_sec": 25133.933147907257, "step_time_sec": 8.228670818003593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3030, "loss": 4.442379951477051, "lr": 0.0002, "elapsed_sec": 25142.16252732277, "step_time_sec": 8.229256722988794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3031, "loss": 4.384205341339111, "lr": 0.0002, "elapsed_sec": 25150.392004966736, "step_time_sec": 8.229261960019358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3032, "loss": 4.439769744873047, "lr": 0.0002, "elapsed_sec": 25158.623046875, "step_time_sec": 8.230893962987466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3033, "loss": 4.473965167999268, "lr": 0.0002, "elapsed_sec": 25166.853402137756, "step_time_sec": 8.230181232007453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3034, "loss": 4.53125524520874, "lr": 0.0002, "elapsed_sec": 25175.0840780735, "step_time_sec": 8.230532843997935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3035, "loss": 4.419245719909668, "lr": 0.0002, "elapsed_sec": 25183.314154863358, "step_time_sec": 8.229922575003002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3036, "loss": 4.424022197723389, "lr": 0.0002, "elapsed_sec": 25191.5422642231, "step_time_sec": 8.22794967298978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3037, "loss": 4.582273483276367, "lr": 0.0002, "elapsed_sec": 25199.773838996887, "step_time_sec": 8.23144695701194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3038, "loss": 4.512271881103516, "lr": 0.0002, "elapsed_sec": 25208.004496574402, "step_time_sec": 8.230566344980616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3039, "loss": 4.594845294952393, "lr": 0.0002, "elapsed_sec": 25216.237416267395, "step_time_sec": 8.232716700993478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3040, "loss": 4.426325798034668, "lr": 0.0002, "elapsed_sec": 25224.466050624847, "step_time_sec": 8.228422800981207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3041, "loss": 4.371211528778076, "lr": 0.0002, "elapsed_sec": 25232.696018218994, "step_time_sec": 8.229814627004089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3042, "loss": 4.398573875427246, "lr": 0.0002, "elapsed_sec": 25240.926849365234, "step_time_sec": 8.230720884981565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3043, "loss": 4.598719120025635, "lr": 0.0002, "elapsed_sec": 25249.15794467926, "step_time_sec": 8.230850854015443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3044, "loss": 4.396803855895996, "lr": 0.0002, "elapsed_sec": 25257.389053821564, "step_time_sec": 8.231014896999113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3045, "loss": 4.429068088531494, "lr": 0.0002, "elapsed_sec": 25265.619226932526, "step_time_sec": 8.22997004701756, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3046, "loss": 4.502948760986328, "lr": 0.0002, "elapsed_sec": 25273.848601818085, "step_time_sec": 8.229285523993894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3047, "loss": 4.515621185302734, "lr": 0.0002, "elapsed_sec": 25282.079463481903, "step_time_sec": 8.230698583996855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3048, "loss": 4.499756336212158, "lr": 0.0002, "elapsed_sec": 25290.30987906456, "step_time_sec": 8.230215135990875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3049, "loss": 4.428409099578857, "lr": 0.0002, "elapsed_sec": 25298.54158139229, "step_time_sec": 8.231516611005645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3050, "loss": 4.590595722198486, "lr": 0.0002, "elapsed_sec": 25306.772242069244, "step_time_sec": 8.230557381000835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3051, "loss": 4.400084972381592, "lr": 0.0002, "elapsed_sec": 25315.004252672195, "step_time_sec": 8.231804524024483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3052, "loss": 4.4301838874816895, "lr": 0.0002, "elapsed_sec": 25323.23468208313, "step_time_sec": 8.230294484994374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3053, "loss": 4.527826309204102, "lr": 0.0002, "elapsed_sec": 25331.46511054039, "step_time_sec": 8.23029053400387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3054, "loss": 4.5423383712768555, "lr": 0.0002, "elapsed_sec": 25339.694929599762, "step_time_sec": 8.229620988975512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3055, "loss": 4.523899555206299, "lr": 0.0002, "elapsed_sec": 25347.924920082092, "step_time_sec": 8.229786373005481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3056, "loss": 4.460410118103027, "lr": 0.0002, "elapsed_sec": 25356.154370069504, "step_time_sec": 8.22932907298673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3057, "loss": 4.546917915344238, "lr": 0.0002, "elapsed_sec": 25364.3830909729, "step_time_sec": 8.228538308001589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3058, "loss": 4.553771495819092, "lr": 0.0002, "elapsed_sec": 25372.613676548004, "step_time_sec": 8.23047619999852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3059, "loss": 4.498196601867676, "lr": 0.0002, "elapsed_sec": 25380.843078136444, "step_time_sec": 8.229194229003042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3060, "loss": 4.572686195373535, "lr": 0.0002, "elapsed_sec": 25389.070969820023, "step_time_sec": 8.227720822003903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3061, "loss": 4.401264667510986, "lr": 0.0002, "elapsed_sec": 25397.302394628525, "step_time_sec": 8.23131405498134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3062, "loss": 4.575231552124023, "lr": 0.0002, "elapsed_sec": 25405.53175830841, "step_time_sec": 8.229149524006061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3063, "loss": 4.460958480834961, "lr": 0.0002, "elapsed_sec": 25413.75992155075, "step_time_sec": 8.228027673991164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3064, "loss": 4.315942764282227, "lr": 0.0002, "elapsed_sec": 25421.989258289337, "step_time_sec": 8.229190392012242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3065, "loss": 4.443437099456787, "lr": 0.0002, "elapsed_sec": 25430.21871805191, "step_time_sec": 8.22932250899612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3066, "loss": 4.314659118652344, "lr": 0.0002, "elapsed_sec": 25438.446949720383, "step_time_sec": 8.22813447099179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3067, "loss": 4.487939834594727, "lr": 0.0002, "elapsed_sec": 25446.675914287567, "step_time_sec": 8.228775009978563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3068, "loss": 4.388115882873535, "lr": 0.0002, "elapsed_sec": 25454.907406806946, "step_time_sec": 8.231297138001537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3069, "loss": 4.456025123596191, "lr": 0.0002, "elapsed_sec": 25463.137902498245, "step_time_sec": 8.230348187003983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3070, "loss": 4.484449863433838, "lr": 0.0002, "elapsed_sec": 25471.369254112244, "step_time_sec": 8.231242177978856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3071, "loss": 4.446601390838623, "lr": 0.0002, "elapsed_sec": 25479.598600149155, "step_time_sec": 8.229173959989566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3072, "loss": 4.51076078414917, "lr": 0.0002, "elapsed_sec": 25487.82797574997, "step_time_sec": 8.229180255992105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3073, "loss": 4.430323123931885, "lr": 0.0002, "elapsed_sec": 25496.058425188065, "step_time_sec": 8.230330393998884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3074, "loss": 4.370058059692383, "lr": 0.0002, "elapsed_sec": 25504.289551258087, "step_time_sec": 8.230953919002786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3075, "loss": 4.30721378326416, "lr": 0.0002, "elapsed_sec": 25512.51937675476, "step_time_sec": 8.22971823302214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3076, "loss": 4.608954906463623, "lr": 0.0002, "elapsed_sec": 25520.750720739365, "step_time_sec": 8.23114204002195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3077, "loss": 4.577596187591553, "lr": 0.0002, "elapsed_sec": 25528.98275732994, "step_time_sec": 8.231871202995535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3078, "loss": 4.637663841247559, "lr": 0.0002, "elapsed_sec": 25537.212285757065, "step_time_sec": 8.22944278398063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3079, "loss": 4.2776713371276855, "lr": 0.0002, "elapsed_sec": 25545.442372083664, "step_time_sec": 8.22991686000023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3080, "loss": 4.60506010055542, "lr": 0.0002, "elapsed_sec": 25553.671313524246, "step_time_sec": 8.228758878016379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3081, "loss": 4.5102763175964355, "lr": 0.0002, "elapsed_sec": 25561.8994512558, "step_time_sec": 8.227963593992172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3082, "loss": 4.4080071449279785, "lr": 0.0002, "elapsed_sec": 25570.129108428955, "step_time_sec": 8.229515766986879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3083, "loss": 4.556649684906006, "lr": 0.0002, "elapsed_sec": 25578.35925602913, "step_time_sec": 8.230026339006145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3084, "loss": 4.419519901275635, "lr": 0.0002, "elapsed_sec": 25586.59001159668, "step_time_sec": 8.230619525013026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3085, "loss": 4.5282416343688965, "lr": 0.0002, "elapsed_sec": 25594.819988965988, "step_time_sec": 8.22981905200868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3086, "loss": 4.6058349609375, "lr": 0.0002, "elapsed_sec": 25603.050424575806, "step_time_sec": 8.230220676981844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3087, "loss": 4.534759521484375, "lr": 0.0002, "elapsed_sec": 25611.281548023224, "step_time_sec": 8.23101839699666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3088, "loss": 4.36432409286499, "lr": 0.0002, "elapsed_sec": 25619.509914159775, "step_time_sec": 8.228193479997572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3089, "loss": 4.604262828826904, "lr": 0.0002, "elapsed_sec": 25627.739303588867, "step_time_sec": 8.229249196010642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3090, "loss": 4.38106632232666, "lr": 0.0002, "elapsed_sec": 25635.965512275696, "step_time_sec": 8.226022293994902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3091, "loss": 4.449632167816162, "lr": 0.0002, "elapsed_sec": 25644.194018125534, "step_time_sec": 8.228347075986676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3092, "loss": 4.379798889160156, "lr": 0.0002, "elapsed_sec": 25652.4216196537, "step_time_sec": 8.227495803002967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3093, "loss": 4.347586631774902, "lr": 0.0002, "elapsed_sec": 25660.65191078186, "step_time_sec": 8.230080425011693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3094, "loss": 4.463610649108887, "lr": 0.0002, "elapsed_sec": 25668.88396835327, "step_time_sec": 8.23189455200918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3095, "loss": 4.5220561027526855, "lr": 0.0002, "elapsed_sec": 25677.114607810974, "step_time_sec": 8.230491672991775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3096, "loss": 4.476691722869873, "lr": 0.0002, "elapsed_sec": 25685.3434278965, "step_time_sec": 8.228648326010443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3097, "loss": 4.534171104431152, "lr": 0.0002, "elapsed_sec": 25693.57292675972, "step_time_sec": 8.229422022996005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3098, "loss": 4.507242679595947, "lr": 0.0002, "elapsed_sec": 25701.802916765213, "step_time_sec": 8.229744089010637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3099, "loss": 4.4496684074401855, "lr": 0.0002, "elapsed_sec": 25710.03352499008, "step_time_sec": 8.230458959995303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3100, "loss": 4.566051483154297, "lr": 0.0002, "elapsed_sec": 25718.262939929962, "step_time_sec": 8.229242611007066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3101, "loss": 4.414078712463379, "lr": 0.0002, "elapsed_sec": 25726.493042230606, "step_time_sec": 8.230007342994213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3102, "loss": 4.4420976638793945, "lr": 0.0002, "elapsed_sec": 25734.724046468735, "step_time_sec": 8.230816782015609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3103, "loss": 4.530849456787109, "lr": 0.0002, "elapsed_sec": 25742.95553946495, "step_time_sec": 8.231305724009871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3104, "loss": 4.456760406494141, "lr": 0.0002, "elapsed_sec": 25751.186690330505, "step_time_sec": 8.23101612998289, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3105, "loss": 4.466392993927002, "lr": 0.0002, "elapsed_sec": 25759.41742682457, "step_time_sec": 8.23059318898595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3106, "loss": 4.491484642028809, "lr": 0.0002, "elapsed_sec": 25767.64780306816, "step_time_sec": 8.2302591369953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3107, "loss": 4.534976005554199, "lr": 0.0002, "elapsed_sec": 25775.878826856613, "step_time_sec": 8.230798707023496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3108, "loss": 4.48853063583374, "lr": 0.0002, "elapsed_sec": 25784.11006975174, "step_time_sec": 8.231087906984612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3109, "loss": 4.5087056159973145, "lr": 0.0002, "elapsed_sec": 25792.339266061783, "step_time_sec": 8.229068751999876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3110, "loss": 4.400065898895264, "lr": 0.0002, "elapsed_sec": 25800.566823005676, "step_time_sec": 8.227381731005153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3111, "loss": 4.316638946533203, "lr": 0.0002, "elapsed_sec": 25808.804991960526, "step_time_sec": 8.231385952007258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3112, "loss": 4.42234468460083, "lr": 0.0002, "elapsed_sec": 25817.03527903557, "step_time_sec": 8.230113629018888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3113, "loss": 4.494475364685059, "lr": 0.0002, "elapsed_sec": 25825.265151262283, "step_time_sec": 8.22969394698157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3114, "loss": 4.516253471374512, "lr": 0.0002, "elapsed_sec": 25833.495725393295, "step_time_sec": 8.230450037983246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3115, "loss": 4.472161293029785, "lr": 0.0002, "elapsed_sec": 25841.726578235626, "step_time_sec": 8.230724349006778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3116, "loss": 4.351531982421875, "lr": 0.0002, "elapsed_sec": 25849.958312273026, "step_time_sec": 8.231546677998267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3117, "loss": 4.452042102813721, "lr": 0.0002, "elapsed_sec": 25858.187751293182, "step_time_sec": 8.22936038998887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3118, "loss": 4.4184770584106445, "lr": 0.0002, "elapsed_sec": 25866.41741991043, "step_time_sec": 8.22942992800381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3119, "loss": 4.5960588455200195, "lr": 0.0002, "elapsed_sec": 25874.648166179657, "step_time_sec": 8.230610385013279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3120, "loss": 4.585423946380615, "lr": 0.0002, "elapsed_sec": 25882.879094839096, "step_time_sec": 8.230806149018463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3121, "loss": 4.450464725494385, "lr": 0.0002, "elapsed_sec": 25891.110422372818, "step_time_sec": 8.231149338011164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3122, "loss": 4.399534702301025, "lr": 0.0002, "elapsed_sec": 25899.340787410736, "step_time_sec": 8.230211218993645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3123, "loss": 4.4506988525390625, "lr": 0.0002, "elapsed_sec": 25907.572592258453, "step_time_sec": 8.23164225500659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3124, "loss": 4.496690273284912, "lr": 0.0002, "elapsed_sec": 25915.80344057083, "step_time_sec": 8.230670185002964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3125, "loss": 4.279498100280762, "lr": 0.0002, "elapsed_sec": 25924.030793190002, "step_time_sec": 8.227220695000142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3126, "loss": 4.585143089294434, "lr": 0.0002, "elapsed_sec": 25932.26024198532, "step_time_sec": 8.229260613006772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3127, "loss": 4.432993412017822, "lr": 0.0002, "elapsed_sec": 25940.49156332016, "step_time_sec": 8.231155543006025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3128, "loss": 4.469257354736328, "lr": 0.0002, "elapsed_sec": 25948.72260260582, "step_time_sec": 8.230925003997982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3129, "loss": 4.508471965789795, "lr": 0.0002, "elapsed_sec": 25956.953171491623, "step_time_sec": 8.230461391998688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3130, "loss": 4.440368175506592, "lr": 0.0002, "elapsed_sec": 25965.181361436844, "step_time_sec": 8.227985733014066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3131, "loss": 4.4681620597839355, "lr": 0.0002, "elapsed_sec": 25973.40966939926, "step_time_sec": 8.228139163024025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3132, "loss": 4.380584239959717, "lr": 0.0002, "elapsed_sec": 25981.64018344879, "step_time_sec": 8.230432610987918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3133, "loss": 4.332805156707764, "lr": 0.0002, "elapsed_sec": 25989.871198415756, "step_time_sec": 8.230806261999533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3134, "loss": 4.426947116851807, "lr": 0.0002, "elapsed_sec": 25998.102251529694, "step_time_sec": 8.230931403988507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3135, "loss": 4.542978763580322, "lr": 0.0002, "elapsed_sec": 26006.33243727684, "step_time_sec": 8.229967382998439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3136, "loss": 4.481237888336182, "lr": 0.0002, "elapsed_sec": 26014.56311392784, "step_time_sec": 8.230527551000705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3137, "loss": 4.4311017990112305, "lr": 0.0002, "elapsed_sec": 26022.794270038605, "step_time_sec": 8.231062230013777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3138, "loss": 4.408867835998535, "lr": 0.0002, "elapsed_sec": 26031.02420091629, "step_time_sec": 8.229781645990442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3139, "loss": 4.485312461853027, "lr": 0.0002, "elapsed_sec": 26039.253507614136, "step_time_sec": 8.229105478007114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3140, "loss": 4.413788318634033, "lr": 0.0002, "elapsed_sec": 26047.484511375427, "step_time_sec": 8.230845921993023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3141, "loss": 4.514333724975586, "lr": 0.0002, "elapsed_sec": 26055.714396715164, "step_time_sec": 8.229734215012286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3142, "loss": 4.592126846313477, "lr": 0.0002, "elapsed_sec": 26063.944145679474, "step_time_sec": 8.229660896991845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3143, "loss": 4.446378707885742, "lr": 0.0002, "elapsed_sec": 26072.174556970596, "step_time_sec": 8.230230808985652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3144, "loss": 4.434617519378662, "lr": 0.0002, "elapsed_sec": 26080.40620470047, "step_time_sec": 8.231442430987954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3145, "loss": 4.443873405456543, "lr": 0.0002, "elapsed_sec": 26088.63486623764, "step_time_sec": 8.228505951992702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3146, "loss": 4.499070644378662, "lr": 0.0002, "elapsed_sec": 26096.863597393036, "step_time_sec": 8.22862629199517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3147, "loss": 4.433960914611816, "lr": 0.0002, "elapsed_sec": 26105.09200692177, "step_time_sec": 8.228226003004238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3148, "loss": 4.398509979248047, "lr": 0.0002, "elapsed_sec": 26113.322782993317, "step_time_sec": 8.230604648008011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3149, "loss": 4.402205467224121, "lr": 0.0002, "elapsed_sec": 26121.55404496193, "step_time_sec": 8.231105943006696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3150, "loss": 4.4102654457092285, "lr": 0.0002, "elapsed_sec": 26129.78466320038, "step_time_sec": 8.230473648989573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3151, "loss": 4.552150249481201, "lr": 0.0002, "elapsed_sec": 26138.01544523239, "step_time_sec": 8.230670186982024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3152, "loss": 4.496363162994385, "lr": 0.0002, "elapsed_sec": 26146.247014522552, "step_time_sec": 8.231418488983763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3153, "loss": 4.563864231109619, "lr": 0.0002, "elapsed_sec": 26154.477952957153, "step_time_sec": 8.23077188298339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3154, "loss": 4.299012660980225, "lr": 0.0002, "elapsed_sec": 26162.709989070892, "step_time_sec": 8.231797212996753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3155, "loss": 4.499622821807861, "lr": 0.0002, "elapsed_sec": 26170.940871953964, "step_time_sec": 8.230744503991446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3156, "loss": 4.366950988769531, "lr": 0.0002, "elapsed_sec": 26179.17319202423, "step_time_sec": 8.232143330009421, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3157, "loss": 4.420504570007324, "lr": 0.0002, "elapsed_sec": 26187.403670310974, "step_time_sec": 8.23031175599317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3158, "loss": 4.640929698944092, "lr": 0.0002, "elapsed_sec": 26195.635179042816, "step_time_sec": 8.231353276001755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3159, "loss": 4.603835105895996, "lr": 0.0002, "elapsed_sec": 26203.86659526825, "step_time_sec": 8.231284289009636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3160, "loss": 4.422759532928467, "lr": 0.0002, "elapsed_sec": 26212.09833741188, "step_time_sec": 8.231644639978185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3161, "loss": 4.4185590744018555, "lr": 0.0002, "elapsed_sec": 26220.327131032944, "step_time_sec": 8.22856014900026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3162, "loss": 4.533501625061035, "lr": 0.0002, "elapsed_sec": 26228.55669760704, "step_time_sec": 8.229444632015657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3163, "loss": 4.39523458480835, "lr": 0.0002, "elapsed_sec": 26236.785587072372, "step_time_sec": 8.228689058014425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3164, "loss": 4.414966583251953, "lr": 0.0002, "elapsed_sec": 26245.013738393784, "step_time_sec": 8.227987366000889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3165, "loss": 4.455351829528809, "lr": 0.0002, "elapsed_sec": 26253.243534564972, "step_time_sec": 8.229658317985013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3166, "loss": 4.472109317779541, "lr": 0.0002, "elapsed_sec": 26261.473875284195, "step_time_sec": 8.230139512976166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3167, "loss": 4.487344264984131, "lr": 0.0002, "elapsed_sec": 26269.706752300262, "step_time_sec": 8.232733846001793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3168, "loss": 4.301603317260742, "lr": 0.0002, "elapsed_sec": 26277.938119888306, "step_time_sec": 8.231257061997894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3169, "loss": 4.353638648986816, "lr": 0.0002, "elapsed_sec": 26286.16956949234, "step_time_sec": 8.23125349599286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3170, "loss": 4.4510016441345215, "lr": 0.0002, "elapsed_sec": 26294.401163101196, "step_time_sec": 8.231432544009294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3171, "loss": 4.425023078918457, "lr": 0.0002, "elapsed_sec": 26302.631983995438, "step_time_sec": 8.230641172995092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3172, "loss": 4.508490085601807, "lr": 0.0002, "elapsed_sec": 26310.86281323433, "step_time_sec": 8.230699924984947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3173, "loss": 4.381847381591797, "lr": 0.0002, "elapsed_sec": 26319.093426942825, "step_time_sec": 8.230465905013261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3174, "loss": 4.6182403564453125, "lr": 0.0002, "elapsed_sec": 26327.323684215546, "step_time_sec": 8.230137831007596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3175, "loss": 4.477121353149414, "lr": 0.0002, "elapsed_sec": 26335.55519247055, "step_time_sec": 8.231297273014206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3176, "loss": 4.673255443572998, "lr": 0.0002, "elapsed_sec": 26343.785224437714, "step_time_sec": 8.229901875020005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3177, "loss": 4.536473274230957, "lr": 0.0002, "elapsed_sec": 26352.016857624054, "step_time_sec": 8.23145997102256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3178, "loss": 4.458102226257324, "lr": 0.0002, "elapsed_sec": 26360.248138189316, "step_time_sec": 8.23114960998646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3179, "loss": 4.529295921325684, "lr": 0.0002, "elapsed_sec": 26368.477744579315, "step_time_sec": 8.229466230026446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3180, "loss": 4.500597953796387, "lr": 0.0002, "elapsed_sec": 26376.706575632095, "step_time_sec": 8.228654375008773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3181, "loss": 4.464397430419922, "lr": 0.0002, "elapsed_sec": 26384.93545818329, "step_time_sec": 8.2287297139992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3182, "loss": 4.420536994934082, "lr": 0.0002, "elapsed_sec": 26393.167284727097, "step_time_sec": 8.23172182397684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3183, "loss": 4.42844295501709, "lr": 0.0002, "elapsed_sec": 26401.397754192352, "step_time_sec": 8.230267804989126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3184, "loss": 4.4195075035095215, "lr": 0.0002, "elapsed_sec": 26409.627340078354, "step_time_sec": 8.229463093011873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3185, "loss": 4.405627250671387, "lr": 0.0002, "elapsed_sec": 26417.856460094452, "step_time_sec": 8.228908026008867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3186, "loss": 4.461382865905762, "lr": 0.0002, "elapsed_sec": 26426.085708379745, "step_time_sec": 8.229089807980927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3187, "loss": 4.621487617492676, "lr": 0.0002, "elapsed_sec": 26434.316982984543, "step_time_sec": 8.231148574996041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3188, "loss": 4.513190746307373, "lr": 0.0002, "elapsed_sec": 26442.547823905945, "step_time_sec": 8.230684286012547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3189, "loss": 4.513082027435303, "lr": 0.0002, "elapsed_sec": 26450.77930498123, "step_time_sec": 8.231282504013507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3190, "loss": 4.452555179595947, "lr": 0.0002, "elapsed_sec": 26459.010066747665, "step_time_sec": 8.230649547011126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3191, "loss": 4.51918363571167, "lr": 0.0002, "elapsed_sec": 26467.24160718918, "step_time_sec": 8.231351275986526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3192, "loss": 4.415657043457031, "lr": 0.0002, "elapsed_sec": 26475.470849990845, "step_time_sec": 8.229109915002482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3193, "loss": 4.480160713195801, "lr": 0.0002, "elapsed_sec": 26483.69889831543, "step_time_sec": 8.227828720002435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3194, "loss": 4.432823657989502, "lr": 0.0002, "elapsed_sec": 26491.92675113678, "step_time_sec": 8.227660510980058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3195, "loss": 4.424227714538574, "lr": 0.0002, "elapsed_sec": 26500.155119657516, "step_time_sec": 8.22822394099785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3196, "loss": 4.520366668701172, "lr": 0.0002, "elapsed_sec": 26508.38423895836, "step_time_sec": 8.229038393008523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3197, "loss": 4.6140947341918945, "lr": 0.0002, "elapsed_sec": 26516.61548113823, "step_time_sec": 8.231010249000974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3198, "loss": 4.439587116241455, "lr": 0.0002, "elapsed_sec": 26524.847226142883, "step_time_sec": 8.231676267983858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3199, "loss": 4.506829738616943, "lr": 0.0002, "elapsed_sec": 26533.077533245087, "step_time_sec": 8.230099241016433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3200, "loss": 4.437835693359375, "lr": 0.0002, "elapsed_sec": 26541.308929681778, "step_time_sec": 8.231261315988377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3201, "loss": 4.508228302001953, "lr": 0.0002, "elapsed_sec": 26549.53959751129, "step_time_sec": 8.230508467997424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3202, "loss": 4.542181491851807, "lr": 0.0002, "elapsed_sec": 26557.766515016556, "step_time_sec": 8.226706540997839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3203, "loss": 4.362091541290283, "lr": 0.0002, "elapsed_sec": 26565.9949593544, "step_time_sec": 8.228298801986966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3204, "loss": 4.574263095855713, "lr": 0.0002, "elapsed_sec": 26574.2255692482, "step_time_sec": 8.230438495025737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3205, "loss": 4.425631523132324, "lr": 0.0002, "elapsed_sec": 26582.455909729004, "step_time_sec": 8.230251611006679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3206, "loss": 4.5056328773498535, "lr": 0.0002, "elapsed_sec": 26590.686386823654, "step_time_sec": 8.230284719000338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3207, "loss": 4.461718559265137, "lr": 0.0002, "elapsed_sec": 26598.916863441467, "step_time_sec": 8.230329073005123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3208, "loss": 4.6201629638671875, "lr": 0.0002, "elapsed_sec": 26607.147569417953, "step_time_sec": 8.230555371002993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3209, "loss": 4.5525312423706055, "lr": 0.0002, "elapsed_sec": 26615.378932714462, "step_time_sec": 8.231208768993383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3210, "loss": 4.473776340484619, "lr": 0.0002, "elapsed_sec": 26623.609072208405, "step_time_sec": 8.22999156199512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3211, "loss": 4.359553813934326, "lr": 0.0002, "elapsed_sec": 26631.840500354767, "step_time_sec": 8.231245484988904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3212, "loss": 4.4687418937683105, "lr": 0.0002, "elapsed_sec": 26640.071082353592, "step_time_sec": 8.230431559000863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3213, "loss": 4.477515697479248, "lr": 0.0002, "elapsed_sec": 26648.300556898117, "step_time_sec": 8.229375766997691, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3214, "loss": 4.450435161590576, "lr": 0.0002, "elapsed_sec": 26656.529408454895, "step_time_sec": 8.228652077988954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3215, "loss": 4.640117645263672, "lr": 0.0002, "elapsed_sec": 26664.759361982346, "step_time_sec": 8.229802794987336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3216, "loss": 4.5370049476623535, "lr": 0.0002, "elapsed_sec": 26672.990649700165, "step_time_sec": 8.231126263010083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3217, "loss": 4.520330905914307, "lr": 0.0002, "elapsed_sec": 26681.21942639351, "step_time_sec": 8.228622435999569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3218, "loss": 4.707184791564941, "lr": 0.0002, "elapsed_sec": 26689.450253725052, "step_time_sec": 8.230727744987234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3219, "loss": 4.4563307762146, "lr": 0.0002, "elapsed_sec": 26697.680045604706, "step_time_sec": 8.229616633005207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3220, "loss": 4.439465522766113, "lr": 0.0002, "elapsed_sec": 26705.90993976593, "step_time_sec": 8.22968259602203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3221, "loss": 4.561869144439697, "lr": 0.0002, "elapsed_sec": 26714.13981771469, "step_time_sec": 8.22971759000211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3222, "loss": 4.291754722595215, "lr": 0.0002, "elapsed_sec": 26722.369158506393, "step_time_sec": 8.22920193200116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3223, "loss": 4.450132846832275, "lr": 0.0002, "elapsed_sec": 26730.599324464798, "step_time_sec": 8.23004221401061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3224, "loss": 4.507689476013184, "lr": 0.0002, "elapsed_sec": 26738.82966899872, "step_time_sec": 8.230178036988946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3225, "loss": 4.329500675201416, "lr": 0.0002, "elapsed_sec": 26747.05888414383, "step_time_sec": 8.229047586995875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3226, "loss": 4.2949018478393555, "lr": 0.0002, "elapsed_sec": 26755.286558628082, "step_time_sec": 8.22751273002359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3227, "loss": 5.217949390411377, "lr": 0.0002, "elapsed_sec": 26763.515472650528, "step_time_sec": 8.228815590002341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3228, "loss": 4.323785781860352, "lr": 0.0002, "elapsed_sec": 26771.74653983116, "step_time_sec": 8.230832310015103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3229, "loss": 4.286735534667969, "lr": 0.0002, "elapsed_sec": 26779.977441072464, "step_time_sec": 8.23079656198388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3230, "loss": 4.389774799346924, "lr": 0.0002, "elapsed_sec": 26788.205563545227, "step_time_sec": 8.227925212995615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3231, "loss": 4.531062602996826, "lr": 0.0002, "elapsed_sec": 26796.43461561203, "step_time_sec": 8.228886833996512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3232, "loss": 4.541388988494873, "lr": 0.0002, "elapsed_sec": 26804.66276884079, "step_time_sec": 8.228022280003643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3233, "loss": 4.446537017822266, "lr": 0.0002, "elapsed_sec": 26812.892379522324, "step_time_sec": 8.229495969018899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3234, "loss": 4.445704460144043, "lr": 0.0002, "elapsed_sec": 26821.11863398552, "step_time_sec": 8.22607000300195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3235, "loss": 4.551733493804932, "lr": 0.0002, "elapsed_sec": 26829.348955869675, "step_time_sec": 8.23016929498408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3236, "loss": 4.5209197998046875, "lr": 0.0002, "elapsed_sec": 26837.5782020092, "step_time_sec": 8.229154994012788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3237, "loss": 4.507297515869141, "lr": 0.0002, "elapsed_sec": 26845.8064596653, "step_time_sec": 8.2281110350159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3238, "loss": 4.554379940032959, "lr": 0.0002, "elapsed_sec": 26854.037650585175, "step_time_sec": 8.23095481900964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3239, "loss": 4.462619781494141, "lr": 0.0002, "elapsed_sec": 26862.265671253204, "step_time_sec": 8.227837106998777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3240, "loss": 4.43787145614624, "lr": 0.0002, "elapsed_sec": 26870.496024131775, "step_time_sec": 8.230196263990365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3241, "loss": 4.467719554901123, "lr": 0.0002, "elapsed_sec": 26878.725385189056, "step_time_sec": 8.22924744701595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3242, "loss": 4.474631309509277, "lr": 0.0002, "elapsed_sec": 26886.95310807228, "step_time_sec": 8.227585569984512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3243, "loss": 4.648032188415527, "lr": 0.0002, "elapsed_sec": 26895.183495759964, "step_time_sec": 8.230208435998065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3244, "loss": 4.5701727867126465, "lr": 0.0002, "elapsed_sec": 26903.41269350052, "step_time_sec": 8.229021250997903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3245, "loss": 4.438131332397461, "lr": 0.0002, "elapsed_sec": 26911.641637563705, "step_time_sec": 8.228752864990383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3246, "loss": 4.435431480407715, "lr": 0.0002, "elapsed_sec": 26919.8711810112, "step_time_sec": 8.22940058700624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3247, "loss": 4.533096790313721, "lr": 0.0002, "elapsed_sec": 26928.09993815422, "step_time_sec": 8.228631389996735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3248, "loss": 4.449465751647949, "lr": 0.0002, "elapsed_sec": 26936.329351186752, "step_time_sec": 8.22924840199994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3249, "loss": 4.526245594024658, "lr": 0.0002, "elapsed_sec": 26944.558037996292, "step_time_sec": 8.22844756001723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3250, "loss": 4.332434177398682, "lr": 0.0002, "elapsed_sec": 26954.215623617172, "step_time_sec": 9.657415455003502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3251, "loss": 4.430172443389893, "lr": 0.0002, "elapsed_sec": 26962.44660258293, "step_time_sec": 8.230832422006642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3252, "loss": 4.429368495941162, "lr": 0.0002, "elapsed_sec": 26970.677556037903, "step_time_sec": 8.230790901026921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3253, "loss": 4.468626022338867, "lr": 0.0002, "elapsed_sec": 26978.909214258194, "step_time_sec": 8.231527081981767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3254, "loss": 4.36950159072876, "lr": 0.0002, "elapsed_sec": 26987.138577222824, "step_time_sec": 8.22920358600095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3255, "loss": 4.47849178314209, "lr": 0.0002, "elapsed_sec": 26995.3675968647, "step_time_sec": 8.228866902005393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3256, "loss": 4.4426374435424805, "lr": 0.0002, "elapsed_sec": 27003.599096536636, "step_time_sec": 8.231403824000154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3257, "loss": 4.531147480010986, "lr": 0.0002, "elapsed_sec": 27011.829909086227, "step_time_sec": 8.230566762998933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3258, "loss": 4.452873706817627, "lr": 0.0002, "elapsed_sec": 27020.061158895493, "step_time_sec": 8.231105807004496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3259, "loss": 4.303166389465332, "lr": 0.0002, "elapsed_sec": 27028.29190015793, "step_time_sec": 8.230585148994578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3260, "loss": 4.557346820831299, "lr": 0.0002, "elapsed_sec": 27036.52199435234, "step_time_sec": 8.229920322017279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3261, "loss": 4.401882648468018, "lr": 0.0002, "elapsed_sec": 27044.75303888321, "step_time_sec": 8.230902641022112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3262, "loss": 4.435324192047119, "lr": 0.0002, "elapsed_sec": 27052.981494903564, "step_time_sec": 8.22832620301051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3263, "loss": 4.333390712738037, "lr": 0.0002, "elapsed_sec": 27061.21093249321, "step_time_sec": 8.229271440999582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3264, "loss": 4.491867542266846, "lr": 0.0002, "elapsed_sec": 27069.44203400612, "step_time_sec": 8.230931210011477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3265, "loss": 4.52962064743042, "lr": 0.0002, "elapsed_sec": 27077.673644304276, "step_time_sec": 8.231501606991515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3266, "loss": 4.610566139221191, "lr": 0.0002, "elapsed_sec": 27085.904561281204, "step_time_sec": 8.230712116026552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3267, "loss": 4.462151050567627, "lr": 0.0002, "elapsed_sec": 27094.134983301163, "step_time_sec": 8.230285658995854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3268, "loss": 4.685315132141113, "lr": 0.0002, "elapsed_sec": 27102.366096496582, "step_time_sec": 8.23091445400496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3269, "loss": 4.481561183929443, "lr": 0.0002, "elapsed_sec": 27110.594021320343, "step_time_sec": 8.227795295009855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3270, "loss": 4.461173057556152, "lr": 0.0002, "elapsed_sec": 27118.823761940002, "step_time_sec": 8.229579909995664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3271, "loss": 4.465943813323975, "lr": 0.0002, "elapsed_sec": 27127.054797172546, "step_time_sec": 8.230910523008788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3272, "loss": 4.517214298248291, "lr": 0.0002, "elapsed_sec": 27135.283385038376, "step_time_sec": 8.228434558986919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3273, "loss": 4.482524394989014, "lr": 0.0002, "elapsed_sec": 27143.51266670227, "step_time_sec": 8.229063984006643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3274, "loss": 4.465067386627197, "lr": 0.0002, "elapsed_sec": 27151.74333167076, "step_time_sec": 8.230488978995709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3275, "loss": 4.426177978515625, "lr": 0.0002, "elapsed_sec": 27159.97409749031, "step_time_sec": 8.230637739994563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3276, "loss": 4.490196228027344, "lr": 0.0002, "elapsed_sec": 27168.205622673035, "step_time_sec": 8.231416833004914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3277, "loss": 4.479206085205078, "lr": 0.0002, "elapsed_sec": 27176.43577528, "step_time_sec": 8.229925091989571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3278, "loss": 4.28537130355835, "lr": 0.0002, "elapsed_sec": 27184.666578769684, "step_time_sec": 8.230653131002327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3279, "loss": 4.370266914367676, "lr": 0.0002, "elapsed_sec": 27192.897102832794, "step_time_sec": 8.230370222998317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3280, "loss": 4.369002342224121, "lr": 0.0002, "elapsed_sec": 27201.128782749176, "step_time_sec": 8.231535522994818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3281, "loss": 4.46621036529541, "lr": 0.0002, "elapsed_sec": 27209.358580112457, "step_time_sec": 8.22966855802224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3282, "loss": 4.517801761627197, "lr": 0.0002, "elapsed_sec": 27217.58711719513, "step_time_sec": 8.22837015899131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3283, "loss": 4.503474235534668, "lr": 0.0002, "elapsed_sec": 27225.816042900085, "step_time_sec": 8.228756062017055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3284, "loss": 4.4352593421936035, "lr": 0.0002, "elapsed_sec": 27234.04667019844, "step_time_sec": 8.230461399973137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3285, "loss": 4.430231094360352, "lr": 0.0002, "elapsed_sec": 27242.27497935295, "step_time_sec": 8.228173906012671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3286, "loss": 4.474393844604492, "lr": 0.0002, "elapsed_sec": 27250.502722263336, "step_time_sec": 8.227550468000118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3287, "loss": 4.556535720825195, "lr": 0.0002, "elapsed_sec": 27258.732017040253, "step_time_sec": 8.22918549698079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3288, "loss": 4.341766834259033, "lr": 0.0002, "elapsed_sec": 27266.961489200592, "step_time_sec": 8.229336573014734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3289, "loss": 4.435474395751953, "lr": 0.0002, "elapsed_sec": 27275.188744306564, "step_time_sec": 8.227028484019684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3290, "loss": 4.525892734527588, "lr": 0.0002, "elapsed_sec": 27283.416729688644, "step_time_sec": 8.227884627995081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3291, "loss": 4.542980670928955, "lr": 0.0002, "elapsed_sec": 27291.646166563034, "step_time_sec": 8.22921945998678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3292, "loss": 4.416932582855225, "lr": 0.0002, "elapsed_sec": 27299.87712597847, "step_time_sec": 8.230804702994647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3293, "loss": 4.533740043640137, "lr": 0.0002, "elapsed_sec": 27308.10913515091, "step_time_sec": 8.23189895998803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3294, "loss": 4.470763206481934, "lr": 0.0002, "elapsed_sec": 27316.339948415756, "step_time_sec": 8.230633573024534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3295, "loss": 4.408924579620361, "lr": 0.0002, "elapsed_sec": 27324.57105612755, "step_time_sec": 8.230931512982352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3296, "loss": 4.396158695220947, "lr": 0.0002, "elapsed_sec": 27332.80216884613, "step_time_sec": 8.230951051984448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3297, "loss": 4.551839351654053, "lr": 0.0002, "elapsed_sec": 27341.033540964127, "step_time_sec": 8.231272780016297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3298, "loss": 4.487804412841797, "lr": 0.0002, "elapsed_sec": 27349.263159751892, "step_time_sec": 8.22942504400271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3299, "loss": 4.314112663269043, "lr": 0.0002, "elapsed_sec": 27357.492413282394, "step_time_sec": 8.22915811499115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3300, "loss": 4.394819259643555, "lr": 0.0002, "elapsed_sec": 27365.722635269165, "step_time_sec": 8.22999612201238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3301, "loss": 4.410363674163818, "lr": 0.0002, "elapsed_sec": 27373.952914714813, "step_time_sec": 8.230197447002865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3302, "loss": 4.356383800506592, "lr": 0.0002, "elapsed_sec": 27382.181832313538, "step_time_sec": 8.228679858002579, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3303, "loss": 4.371292591094971, "lr": 0.0002, "elapsed_sec": 27390.412004947662, "step_time_sec": 8.230009346007137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3304, "loss": 4.289678573608398, "lr": 0.0002, "elapsed_sec": 27398.64298057556, "step_time_sec": 8.230864440993173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3305, "loss": 4.542479515075684, "lr": 0.0002, "elapsed_sec": 27406.87362408638, "step_time_sec": 8.230454650998581, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3306, "loss": 4.277215003967285, "lr": 0.0002, "elapsed_sec": 27415.10357618332, "step_time_sec": 8.229756087006535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3307, "loss": 4.360106945037842, "lr": 0.0002, "elapsed_sec": 27423.33482170105, "step_time_sec": 8.231118594005238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3308, "loss": 4.3852128982543945, "lr": 0.0002, "elapsed_sec": 27431.565580129623, "step_time_sec": 8.230565125995781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3309, "loss": 4.278487205505371, "lr": 0.0002, "elapsed_sec": 27439.797023296356, "step_time_sec": 8.231288683018647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3310, "loss": 4.464388847351074, "lr": 0.0002, "elapsed_sec": 27448.02614712715, "step_time_sec": 8.228965476999292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3311, "loss": 4.516493797302246, "lr": 0.0002, "elapsed_sec": 27456.25706934929, "step_time_sec": 8.230766609980492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3312, "loss": 4.392803192138672, "lr": 0.0002, "elapsed_sec": 27464.487686157227, "step_time_sec": 8.230461997998646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3313, "loss": 4.5551934242248535, "lr": 0.0002, "elapsed_sec": 27472.718432188034, "step_time_sec": 8.230576433998067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3314, "loss": 4.42043399810791, "lr": 0.0002, "elapsed_sec": 27480.95064330101, "step_time_sec": 8.232059305999428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3315, "loss": 4.649469375610352, "lr": 0.0002, "elapsed_sec": 27489.180938005447, "step_time_sec": 8.230160826002248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3316, "loss": 4.385161399841309, "lr": 0.0002, "elapsed_sec": 27497.41250514984, "step_time_sec": 8.23139159497805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3317, "loss": 4.378974914550781, "lr": 0.0002, "elapsed_sec": 27505.64214515686, "step_time_sec": 8.229460073023802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3318, "loss": 4.490382671356201, "lr": 0.0002, "elapsed_sec": 27513.8734126091, "step_time_sec": 8.23118438498932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3319, "loss": 4.422257900238037, "lr": 0.0002, "elapsed_sec": 27522.104786157608, "step_time_sec": 8.231142124976031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3320, "loss": 4.378215789794922, "lr": 0.0002, "elapsed_sec": 27530.335671901703, "step_time_sec": 8.230750482005533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3321, "loss": 4.4950785636901855, "lr": 0.0002, "elapsed_sec": 27538.56551361084, "step_time_sec": 8.229754024010617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3322, "loss": 4.34498405456543, "lr": 0.0002, "elapsed_sec": 27546.79451560974, "step_time_sec": 8.228762964979978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3323, "loss": 4.386223316192627, "lr": 0.0002, "elapsed_sec": 27555.024448156357, "step_time_sec": 8.229873837000923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3324, "loss": 4.646018981933594, "lr": 0.0002, "elapsed_sec": 27563.2543861866, "step_time_sec": 8.229694723995635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3325, "loss": 4.564456939697266, "lr": 0.0002, "elapsed_sec": 27571.486129760742, "step_time_sec": 8.231653446011478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3326, "loss": 4.309813022613525, "lr": 0.0002, "elapsed_sec": 27579.714694976807, "step_time_sec": 8.228386268019676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3327, "loss": 4.442012786865234, "lr": 0.0002, "elapsed_sec": 27587.943600177765, "step_time_sec": 8.228699269006029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3328, "loss": 4.539633750915527, "lr": 0.0002, "elapsed_sec": 27596.173862457275, "step_time_sec": 8.230117010010872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3329, "loss": 4.391417503356934, "lr": 0.0002, "elapsed_sec": 27604.40483021736, "step_time_sec": 8.230821795994416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3330, "loss": 4.474010944366455, "lr": 0.0002, "elapsed_sec": 27612.636021137238, "step_time_sec": 8.231037168006878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3331, "loss": 4.360463619232178, "lr": 0.0002, "elapsed_sec": 27620.867304325104, "step_time_sec": 8.231116536015179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3332, "loss": 4.529821872711182, "lr": 0.0002, "elapsed_sec": 27629.098655462265, "step_time_sec": 8.231221111986088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3333, "loss": 4.29197883605957, "lr": 0.0002, "elapsed_sec": 27637.329959869385, "step_time_sec": 8.23112479300471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3334, "loss": 4.448523044586182, "lr": 0.0002, "elapsed_sec": 27645.55979681015, "step_time_sec": 8.229715816007229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3335, "loss": 4.33416223526001, "lr": 0.0002, "elapsed_sec": 27653.79113650322, "step_time_sec": 8.23120973201003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3336, "loss": 4.478496074676514, "lr": 0.0002, "elapsed_sec": 27662.02069211006, "step_time_sec": 8.229362197977025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3337, "loss": 4.393015384674072, "lr": 0.0002, "elapsed_sec": 27670.251900196075, "step_time_sec": 8.231027374014957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3338, "loss": 4.552966594696045, "lr": 0.0002, "elapsed_sec": 27678.483026742935, "step_time_sec": 8.230994270998053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3339, "loss": 4.543249130249023, "lr": 0.0002, "elapsed_sec": 27686.710874557495, "step_time_sec": 8.22766956599662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3340, "loss": 4.436389446258545, "lr": 0.0002, "elapsed_sec": 27694.939237117767, "step_time_sec": 8.228225770988502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3341, "loss": 4.563558101654053, "lr": 0.0002, "elapsed_sec": 27703.168085575104, "step_time_sec": 8.22869324599742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3342, "loss": 4.32221794128418, "lr": 0.0002, "elapsed_sec": 27711.39812850952, "step_time_sec": 8.229883840016555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3343, "loss": 4.4896674156188965, "lr": 0.0002, "elapsed_sec": 27719.628046274185, "step_time_sec": 8.229785825998988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3344, "loss": 4.480669021606445, "lr": 0.0002, "elapsed_sec": 27727.858698129654, "step_time_sec": 8.230461892992025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3345, "loss": 4.462979793548584, "lr": 0.0002, "elapsed_sec": 27736.090193748474, "step_time_sec": 8.231367000000319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3346, "loss": 4.320279598236084, "lr": 0.0002, "elapsed_sec": 27744.321136713028, "step_time_sec": 8.230800871009706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3347, "loss": 4.294346809387207, "lr": 0.0002, "elapsed_sec": 27752.5504925251, "step_time_sec": 8.229171131999465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3348, "loss": 4.504697322845459, "lr": 0.0002, "elapsed_sec": 27760.77807378769, "step_time_sec": 8.227424953016452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3349, "loss": 4.39504337310791, "lr": 0.0002, "elapsed_sec": 27769.008885622025, "step_time_sec": 8.230703212000662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3350, "loss": 4.3238396644592285, "lr": 0.0002, "elapsed_sec": 27777.23930644989, "step_time_sec": 8.230209514993476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3351, "loss": 4.4087018966674805, "lr": 0.0002, "elapsed_sec": 27785.470624923706, "step_time_sec": 8.231151755986502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3352, "loss": 4.459768772125244, "lr": 0.0002, "elapsed_sec": 27793.701618909836, "step_time_sec": 8.230937729997095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3353, "loss": 4.500437259674072, "lr": 0.0002, "elapsed_sec": 27801.933052778244, "step_time_sec": 8.23124579701107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3354, "loss": 4.468922138214111, "lr": 0.0002, "elapsed_sec": 27810.162905693054, "step_time_sec": 8.229609608999453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3355, "loss": 4.42917013168335, "lr": 0.0002, "elapsed_sec": 27818.393670082092, "step_time_sec": 8.230625010997755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3356, "loss": 4.409924507141113, "lr": 0.0002, "elapsed_sec": 27826.625399827957, "step_time_sec": 8.231604983011493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3357, "loss": 4.372023582458496, "lr": 0.0002, "elapsed_sec": 27834.856490135193, "step_time_sec": 8.230903413001215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3358, "loss": 4.344518184661865, "lr": 0.0002, "elapsed_sec": 27843.08750963211, "step_time_sec": 8.230873191001592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3359, "loss": 4.4210004806518555, "lr": 0.0002, "elapsed_sec": 27851.318929195404, "step_time_sec": 8.231242000008933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3360, "loss": 4.438368797302246, "lr": 0.0002, "elapsed_sec": 27859.548785686493, "step_time_sec": 8.229730824008584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3361, "loss": 4.415255546569824, "lr": 0.0002, "elapsed_sec": 27867.777939081192, "step_time_sec": 8.228951609984506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3362, "loss": 4.41702938079834, "lr": 0.0002, "elapsed_sec": 27876.00662255287, "step_time_sec": 8.228536798007553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3363, "loss": 4.354255199432373, "lr": 0.0002, "elapsed_sec": 27884.235327720642, "step_time_sec": 8.22863835701719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3364, "loss": 4.388772010803223, "lr": 0.0002, "elapsed_sec": 27892.463218927383, "step_time_sec": 8.227697366004577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3365, "loss": 4.4722418785095215, "lr": 0.0002, "elapsed_sec": 27900.69414448738, "step_time_sec": 8.23073848398053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3366, "loss": 4.389218330383301, "lr": 0.0002, "elapsed_sec": 27908.925486326218, "step_time_sec": 8.231219707988203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3367, "loss": 4.4344964027404785, "lr": 0.0002, "elapsed_sec": 27917.154183626175, "step_time_sec": 8.228513559006387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3368, "loss": 4.404391765594482, "lr": 0.0002, "elapsed_sec": 27925.38424229622, "step_time_sec": 8.229886105982587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3369, "loss": 4.446346282958984, "lr": 0.0002, "elapsed_sec": 27933.613533496857, "step_time_sec": 8.22913908699411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3370, "loss": 4.450611591339111, "lr": 0.0002, "elapsed_sec": 27941.841734409332, "step_time_sec": 8.228039981011534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3371, "loss": 4.403958320617676, "lr": 0.0002, "elapsed_sec": 27950.07044506073, "step_time_sec": 8.228560330986511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3372, "loss": 4.345091342926025, "lr": 0.0002, "elapsed_sec": 27958.300627231598, "step_time_sec": 8.230100581014995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3373, "loss": 4.424515724182129, "lr": 0.0002, "elapsed_sec": 27966.531338214874, "step_time_sec": 8.230480044992873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3374, "loss": 4.519711494445801, "lr": 0.0002, "elapsed_sec": 27974.7620575428, "step_time_sec": 8.230585593002615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3375, "loss": 4.496437072753906, "lr": 0.0002, "elapsed_sec": 27982.992826461792, "step_time_sec": 8.230573135981103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3376, "loss": 4.397324562072754, "lr": 0.0002, "elapsed_sec": 27991.223655939102, "step_time_sec": 8.23072641401086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3377, "loss": 4.2599196434021, "lr": 0.0002, "elapsed_sec": 27999.45222711563, "step_time_sec": 8.22841012099525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3378, "loss": 4.2588582038879395, "lr": 0.0002, "elapsed_sec": 28007.68093395233, "step_time_sec": 8.228494956012582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3379, "loss": 4.510846138000488, "lr": 0.0002, "elapsed_sec": 28015.910047531128, "step_time_sec": 8.228984469023999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3380, "loss": 4.427487850189209, "lr": 0.0002, "elapsed_sec": 28024.14079093933, "step_time_sec": 8.230615080014104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3381, "loss": 4.506715774536133, "lr": 0.0002, "elapsed_sec": 28032.3722448349, "step_time_sec": 8.231267625000328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3382, "loss": 4.3623576164245605, "lr": 0.0002, "elapsed_sec": 28040.60320329666, "step_time_sec": 8.230805864994181, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3383, "loss": 4.560169219970703, "lr": 0.0002, "elapsed_sec": 28048.83463191986, "step_time_sec": 8.23130222299369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3384, "loss": 4.37449312210083, "lr": 0.0002, "elapsed_sec": 28057.065459251404, "step_time_sec": 8.23069126199698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3385, "loss": 4.420494079589844, "lr": 0.0002, "elapsed_sec": 28065.295701026917, "step_time_sec": 8.230062360002194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3386, "loss": 4.4174628257751465, "lr": 0.0002, "elapsed_sec": 28073.52693796158, "step_time_sec": 8.231057071010582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3387, "loss": 4.202512741088867, "lr": 0.0002, "elapsed_sec": 28081.75749540329, "step_time_sec": 8.230394513986539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3388, "loss": 4.449079513549805, "lr": 0.0002, "elapsed_sec": 28089.9874856472, "step_time_sec": 8.229849851020845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3389, "loss": 4.454647064208984, "lr": 0.0002, "elapsed_sec": 28098.216881275177, "step_time_sec": 8.229245827998966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3390, "loss": 4.491340637207031, "lr": 0.0002, "elapsed_sec": 28106.44770669937, "step_time_sec": 8.23063783699763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3391, "loss": 4.465384006500244, "lr": 0.0002, "elapsed_sec": 28114.67752099037, "step_time_sec": 8.229692269000225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3392, "loss": 4.506267070770264, "lr": 0.0002, "elapsed_sec": 28122.908338069916, "step_time_sec": 8.230670285003725, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3393, "loss": 4.45588493347168, "lr": 0.0002, "elapsed_sec": 28131.139801740646, "step_time_sec": 8.231277508981293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3394, "loss": 4.393284320831299, "lr": 0.0002, "elapsed_sec": 28139.37021589279, "step_time_sec": 8.230313824984478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3395, "loss": 4.404869556427002, "lr": 0.0002, "elapsed_sec": 28147.60136270523, "step_time_sec": 8.23092957600602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3396, "loss": 4.346882343292236, "lr": 0.0002, "elapsed_sec": 28155.8319876194, "step_time_sec": 8.230470384994987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3397, "loss": 4.45742130279541, "lr": 0.0002, "elapsed_sec": 28164.063031196594, "step_time_sec": 8.230858558992622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3398, "loss": 4.401591777801514, "lr": 0.0002, "elapsed_sec": 28172.291692495346, "step_time_sec": 8.228493756003445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3399, "loss": 4.356271266937256, "lr": 0.0002, "elapsed_sec": 28180.521866321564, "step_time_sec": 8.230036495020613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3400, "loss": 4.433597564697266, "lr": 0.0002, "elapsed_sec": 28188.752028226852, "step_time_sec": 8.229999488015892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3401, "loss": 4.460267066955566, "lr": 0.0002, "elapsed_sec": 28196.979583740234, "step_time_sec": 8.227396785019664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3402, "loss": 4.488476753234863, "lr": 0.0002, "elapsed_sec": 28205.207997322083, "step_time_sec": 8.228245790989604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3403, "loss": 4.28740119934082, "lr": 0.0002, "elapsed_sec": 28213.438030958176, "step_time_sec": 8.229864014982013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3404, "loss": 4.422746658325195, "lr": 0.0002, "elapsed_sec": 28221.66855454445, "step_time_sec": 8.230350773985265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3405, "loss": 4.464597702026367, "lr": 0.0002, "elapsed_sec": 28229.89925289154, "step_time_sec": 8.23053756498848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3406, "loss": 4.374096870422363, "lr": 0.0002, "elapsed_sec": 28238.127296447754, "step_time_sec": 8.2279151630064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3407, "loss": 4.380039215087891, "lr": 0.0002, "elapsed_sec": 28246.356905460358, "step_time_sec": 8.229484592011431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3408, "loss": 4.481874942779541, "lr": 0.0002, "elapsed_sec": 28254.586802244186, "step_time_sec": 8.229753273015376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3409, "loss": 4.475619316101074, "lr": 0.0002, "elapsed_sec": 28262.81821489334, "step_time_sec": 8.231192892999388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3410, "loss": 4.463409423828125, "lr": 0.0002, "elapsed_sec": 28271.049723148346, "step_time_sec": 8.231375942996237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3411, "loss": 4.360481262207031, "lr": 0.0002, "elapsed_sec": 28279.2796523571, "step_time_sec": 8.229777222994016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3412, "loss": 4.499388217926025, "lr": 0.0002, "elapsed_sec": 28287.509966611862, "step_time_sec": 8.230145430978155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3413, "loss": 4.354983806610107, "lr": 0.0002, "elapsed_sec": 28295.740107297897, "step_time_sec": 8.229986097983783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3414, "loss": 4.430453300476074, "lr": 0.0002, "elapsed_sec": 28303.97096681595, "step_time_sec": 8.23069234401919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3415, "loss": 4.442412853240967, "lr": 0.0002, "elapsed_sec": 28312.202184200287, "step_time_sec": 8.231071140995482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3416, "loss": 4.5357232093811035, "lr": 0.0002, "elapsed_sec": 28320.431485414505, "step_time_sec": 8.229161887022201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3417, "loss": 4.428037166595459, "lr": 0.0002, "elapsed_sec": 28328.660708665848, "step_time_sec": 8.229059044999303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3418, "loss": 4.3353190422058105, "lr": 0.0002, "elapsed_sec": 28336.890744924545, "step_time_sec": 8.229912688984768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3419, "loss": 4.430807590484619, "lr": 0.0002, "elapsed_sec": 28345.12251663208, "step_time_sec": 8.231581705011195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3420, "loss": 4.498447895050049, "lr": 0.0002, "elapsed_sec": 28353.35295343399, "step_time_sec": 8.230335085012484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3421, "loss": 4.441807746887207, "lr": 0.0002, "elapsed_sec": 28361.58194541931, "step_time_sec": 8.228763326012995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3422, "loss": 4.474726676940918, "lr": 0.0002, "elapsed_sec": 28369.812304973602, "step_time_sec": 8.23027145798551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3423, "loss": 4.325424671173096, "lr": 0.0002, "elapsed_sec": 28378.04152226448, "step_time_sec": 8.229003565007588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3424, "loss": 4.461056232452393, "lr": 0.0002, "elapsed_sec": 28386.27242422104, "step_time_sec": 8.230729403992882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3425, "loss": 4.354142665863037, "lr": 0.0002, "elapsed_sec": 28394.50279378891, "step_time_sec": 8.230290231993422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3426, "loss": 4.143357753753662, "lr": 0.0002, "elapsed_sec": 28402.734372138977, "step_time_sec": 8.231361765996553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3427, "loss": 4.374614238739014, "lr": 0.0002, "elapsed_sec": 28410.963555336, "step_time_sec": 8.229039530007867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3428, "loss": 4.349703788757324, "lr": 0.0002, "elapsed_sec": 28419.19308447838, "step_time_sec": 8.229423926997697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3429, "loss": 4.406083106994629, "lr": 0.0002, "elapsed_sec": 28427.42445731163, "step_time_sec": 8.231168603990227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3430, "loss": 4.338114261627197, "lr": 0.0002, "elapsed_sec": 28435.655330181122, "step_time_sec": 8.230727942020167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3431, "loss": 4.398792743682861, "lr": 0.0002, "elapsed_sec": 28443.88614153862, "step_time_sec": 8.230663169990294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3432, "loss": 4.443232536315918, "lr": 0.0002, "elapsed_sec": 28452.116966962814, "step_time_sec": 8.230683666013647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3433, "loss": 4.4172892570495605, "lr": 0.0002, "elapsed_sec": 28460.34848856926, "step_time_sec": 8.23131623500376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3434, "loss": 4.452401161193848, "lr": 0.0002, "elapsed_sec": 28468.578952550888, "step_time_sec": 8.230324244010262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3435, "loss": 4.464353561401367, "lr": 0.0002, "elapsed_sec": 28476.808821439743, "step_time_sec": 8.229768866993254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3436, "loss": 4.394493103027344, "lr": 0.0002, "elapsed_sec": 28485.03613758087, "step_time_sec": 8.22711179801263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3437, "loss": 4.39304780960083, "lr": 0.0002, "elapsed_sec": 28493.265964508057, "step_time_sec": 8.229687036015093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3438, "loss": 4.494816780090332, "lr": 0.0002, "elapsed_sec": 28501.495837450027, "step_time_sec": 8.229717306996463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3439, "loss": 4.448995113372803, "lr": 0.0002, "elapsed_sec": 28509.72344827652, "step_time_sec": 8.227511506993324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3440, "loss": 4.3641533851623535, "lr": 0.0002, "elapsed_sec": 28517.952167749405, "step_time_sec": 8.228569937986322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3441, "loss": 4.389455795288086, "lr": 0.0002, "elapsed_sec": 28526.182213306427, "step_time_sec": 8.229824971000198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3442, "loss": 4.348433971405029, "lr": 0.0002, "elapsed_sec": 28534.41217160225, "step_time_sec": 8.229822510998929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3443, "loss": 4.356039524078369, "lr": 0.0002, "elapsed_sec": 28542.640768527985, "step_time_sec": 8.228421495994553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3444, "loss": 4.340787410736084, "lr": 0.0002, "elapsed_sec": 28550.869242191315, "step_time_sec": 8.228325778007274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3445, "loss": 4.521442413330078, "lr": 0.0002, "elapsed_sec": 28559.097772359848, "step_time_sec": 8.228385001013521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3446, "loss": 4.382851600646973, "lr": 0.0002, "elapsed_sec": 28567.32669878006, "step_time_sec": 8.228761292004492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3447, "loss": 4.504435062408447, "lr": 0.0002, "elapsed_sec": 28575.55795288086, "step_time_sec": 8.231160152994562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3448, "loss": 4.3992180824279785, "lr": 0.0002, "elapsed_sec": 28583.788609027863, "step_time_sec": 8.230416626989609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3449, "loss": 4.433636665344238, "lr": 0.0002, "elapsed_sec": 28592.016555547714, "step_time_sec": 8.227798784995684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3450, "loss": 4.271605491638184, "lr": 0.0002, "elapsed_sec": 28600.246994495392, "step_time_sec": 8.230277021008078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3451, "loss": 4.427840232849121, "lr": 0.0002, "elapsed_sec": 28608.477827310562, "step_time_sec": 8.23071636800887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3452, "loss": 4.367599010467529, "lr": 0.0002, "elapsed_sec": 28616.70640516281, "step_time_sec": 8.228393952012993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3453, "loss": 4.354592323303223, "lr": 0.0002, "elapsed_sec": 28624.934767723083, "step_time_sec": 8.228254691988695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3454, "loss": 4.338437557220459, "lr": 0.0002, "elapsed_sec": 28633.1652238369, "step_time_sec": 8.23028067901032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3455, "loss": 4.35774040222168, "lr": 0.0002, "elapsed_sec": 28641.395919799805, "step_time_sec": 8.23051054001553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3456, "loss": 4.423768997192383, "lr": 0.0002, "elapsed_sec": 28649.62611937523, "step_time_sec": 8.23005400999682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3457, "loss": 4.539088726043701, "lr": 0.0002, "elapsed_sec": 28657.855649471283, "step_time_sec": 8.229344445018796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3458, "loss": 4.457990646362305, "lr": 0.0002, "elapsed_sec": 28666.08643269539, "step_time_sec": 8.230645087023731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3459, "loss": 4.45110559463501, "lr": 0.0002, "elapsed_sec": 28674.317915201187, "step_time_sec": 8.231363998987945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3460, "loss": 4.336256504058838, "lr": 0.0002, "elapsed_sec": 28682.54979801178, "step_time_sec": 8.231662631995277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3461, "loss": 4.361005783081055, "lr": 0.0002, "elapsed_sec": 28690.780153036118, "step_time_sec": 8.230220674013253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3462, "loss": 4.399612903594971, "lr": 0.0002, "elapsed_sec": 28699.01030921936, "step_time_sec": 8.230010759987636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3463, "loss": 4.2669677734375, "lr": 0.0002, "elapsed_sec": 28707.24141716957, "step_time_sec": 8.230942887981655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3464, "loss": 4.396317005157471, "lr": 0.0002, "elapsed_sec": 28715.472928762436, "step_time_sec": 8.231363173021236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3465, "loss": 4.376651763916016, "lr": 0.0002, "elapsed_sec": 28723.70370411873, "step_time_sec": 8.230632957012858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3466, "loss": 4.362549304962158, "lr": 0.0002, "elapsed_sec": 28731.934600830078, "step_time_sec": 8.23074596497463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3467, "loss": 4.372727870941162, "lr": 0.0002, "elapsed_sec": 28740.164600610733, "step_time_sec": 8.229871146992082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3468, "loss": 4.394814491271973, "lr": 0.0002, "elapsed_sec": 28748.39549088478, "step_time_sec": 8.230681281012949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3469, "loss": 4.312089920043945, "lr": 0.0002, "elapsed_sec": 28756.626976966858, "step_time_sec": 8.2313640829816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3470, "loss": 4.278158187866211, "lr": 0.0002, "elapsed_sec": 28764.85852575302, "step_time_sec": 8.231420399999479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3471, "loss": 4.308925151824951, "lr": 0.0002, "elapsed_sec": 28773.089272260666, "step_time_sec": 8.230524179001804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3472, "loss": 4.341785907745361, "lr": 0.0002, "elapsed_sec": 28781.32047867775, "step_time_sec": 8.23105109500466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3473, "loss": 4.37357234954834, "lr": 0.0002, "elapsed_sec": 28789.55122923851, "step_time_sec": 8.230629750003573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3474, "loss": 4.505119323730469, "lr": 0.0002, "elapsed_sec": 28797.782187223434, "step_time_sec": 8.230774152994854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3475, "loss": 4.228499412536621, "lr": 0.0002, "elapsed_sec": 28806.010704755783, "step_time_sec": 8.2283718889812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3476, "loss": 4.2640275955200195, "lr": 0.0002, "elapsed_sec": 28814.240855932236, "step_time_sec": 8.229994491004618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3477, "loss": 4.277612686157227, "lr": 0.0002, "elapsed_sec": 28822.4703707695, "step_time_sec": 8.229337781987851, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3478, "loss": 4.455322742462158, "lr": 0.0002, "elapsed_sec": 28830.700412750244, "step_time_sec": 8.229923220991623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3479, "loss": 4.367403030395508, "lr": 0.0002, "elapsed_sec": 28838.931490182877, "step_time_sec": 8.230906323995441, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3480, "loss": 4.373393535614014, "lr": 0.0002, "elapsed_sec": 28847.162516117096, "step_time_sec": 8.230875579989515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3481, "loss": 4.202367782592773, "lr": 0.0002, "elapsed_sec": 28855.393696308136, "step_time_sec": 8.23106253502192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3482, "loss": 4.3937554359436035, "lr": 0.0002, "elapsed_sec": 28863.62439775467, "step_time_sec": 8.230506958003389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3483, "loss": 4.261160850524902, "lr": 0.0002, "elapsed_sec": 28871.854779720306, "step_time_sec": 8.23021500298637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3484, "loss": 4.268413543701172, "lr": 0.0002, "elapsed_sec": 28880.086442947388, "step_time_sec": 8.231549565010937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3485, "loss": 3.8517518043518066, "lr": 0.0002, "elapsed_sec": 28888.317009687424, "step_time_sec": 8.230397298000753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3486, "loss": 4.437774181365967, "lr": 0.0002, "elapsed_sec": 28896.54542207718, "step_time_sec": 8.228237065981375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3487, "loss": 4.379549503326416, "lr": 0.0002, "elapsed_sec": 28904.772270917892, "step_time_sec": 8.22672321400023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3488, "loss": 4.270596981048584, "lr": 0.0002, "elapsed_sec": 28913.00314450264, "step_time_sec": 8.230666246992769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3489, "loss": 4.3458757400512695, "lr": 0.0002, "elapsed_sec": 28921.233246088028, "step_time_sec": 8.23002369599999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3490, "loss": 4.430504322052002, "lr": 0.0002, "elapsed_sec": 28929.46445083618, "step_time_sec": 8.231002360000275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3491, "loss": 4.303278923034668, "lr": 0.0002, "elapsed_sec": 28937.694167375565, "step_time_sec": 8.229530890996102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3492, "loss": 4.388810634613037, "lr": 0.0002, "elapsed_sec": 28945.923370599747, "step_time_sec": 8.229040466976585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3493, "loss": 4.314977645874023, "lr": 0.0002, "elapsed_sec": 28954.15439748764, "step_time_sec": 8.23093111300841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3494, "loss": 4.344173431396484, "lr": 0.0002, "elapsed_sec": 28962.385350227356, "step_time_sec": 8.23074772200198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3495, "loss": 4.353844165802002, "lr": 0.0002, "elapsed_sec": 28970.616719722748, "step_time_sec": 8.231257119012298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3496, "loss": 4.412060737609863, "lr": 0.0002, "elapsed_sec": 28978.84737610817, "step_time_sec": 8.230459140002495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3497, "loss": 4.3751678466796875, "lr": 0.0002, "elapsed_sec": 28987.077987909317, "step_time_sec": 8.230449869995937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3498, "loss": 4.434428691864014, "lr": 0.0002, "elapsed_sec": 28995.308859586716, "step_time_sec": 8.230791434994899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3499, "loss": 4.403817653656006, "lr": 0.0002, "elapsed_sec": 29003.537236452103, "step_time_sec": 8.228161904000444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3500, "loss": 4.346377849578857, "lr": 0.0002, "elapsed_sec": 29011.767226457596, "step_time_sec": 29.533175038988702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3501, "loss": 4.338993549346924, "lr": 0.0002, "elapsed_sec": 29041.315746068954, "step_time_sec": 8.244748187978985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3502, "loss": 4.423699855804443, "lr": 0.0002, "elapsed_sec": 29049.542994976044, "step_time_sec": 8.227032946015242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3503, "loss": 4.402364253997803, "lr": 0.0002, "elapsed_sec": 29057.759811639786, "step_time_sec": 8.216610177012626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3504, "loss": 4.544566631317139, "lr": 0.0002, "elapsed_sec": 29065.976602315903, "step_time_sec": 8.216627232002793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3505, "loss": 4.42059326171875, "lr": 0.0002, "elapsed_sec": 29074.193108558655, "step_time_sec": 8.21641515698866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3506, "loss": 4.3690876960754395, "lr": 0.0002, "elapsed_sec": 29082.410264968872, "step_time_sec": 8.216941332997521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3507, "loss": 4.206995010375977, "lr": 0.0002, "elapsed_sec": 29090.62691617012, "step_time_sec": 8.216527862008661, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3508, "loss": 4.193863391876221, "lr": 0.0002, "elapsed_sec": 29098.84384059906, "step_time_sec": 8.216727321007056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3509, "loss": 4.338441848754883, "lr": 0.0002, "elapsed_sec": 29107.06056022644, "step_time_sec": 8.216559832013445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3510, "loss": 4.413856029510498, "lr": 0.0002, "elapsed_sec": 29115.29116678238, "step_time_sec": 8.230528707994381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3511, "loss": 4.5623250007629395, "lr": 0.0002, "elapsed_sec": 29123.52192735672, "step_time_sec": 8.230530664994149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3512, "loss": 4.395768642425537, "lr": 0.0002, "elapsed_sec": 29131.75271463394, "step_time_sec": 8.230689874995733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3513, "loss": 4.341139793395996, "lr": 0.0002, "elapsed_sec": 29139.98331975937, "step_time_sec": 8.23037860300974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3514, "loss": 4.433938503265381, "lr": 0.0002, "elapsed_sec": 29148.2144241333, "step_time_sec": 8.231008973991266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3515, "loss": 4.568587303161621, "lr": 0.0002, "elapsed_sec": 29156.443996429443, "step_time_sec": 8.229374575021211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3516, "loss": 4.447394371032715, "lr": 0.0002, "elapsed_sec": 29164.67154955864, "step_time_sec": 8.227380794996861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3517, "loss": 4.288628578186035, "lr": 0.0002, "elapsed_sec": 29172.901064157486, "step_time_sec": 8.229363180988003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3518, "loss": 4.315690994262695, "lr": 0.0002, "elapsed_sec": 29181.13174843788, "step_time_sec": 8.230587961996207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3519, "loss": 4.31873083114624, "lr": 0.0002, "elapsed_sec": 29189.362420797348, "step_time_sec": 8.230453099997249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3520, "loss": 4.358511447906494, "lr": 0.0002, "elapsed_sec": 29197.592461586, "step_time_sec": 8.229875887016533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3521, "loss": 4.467759132385254, "lr": 0.0002, "elapsed_sec": 29205.822638750076, "step_time_sec": 8.230089035001583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3522, "loss": 4.33433723449707, "lr": 0.0002, "elapsed_sec": 29214.055318832397, "step_time_sec": 8.232480392995058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3523, "loss": 4.402170181274414, "lr": 0.0002, "elapsed_sec": 29222.285773038864, "step_time_sec": 8.23028060598881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3524, "loss": 4.272892475128174, "lr": 0.0002, "elapsed_sec": 29230.51654958725, "step_time_sec": 8.230668773991056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3525, "loss": 4.415740489959717, "lr": 0.0002, "elapsed_sec": 29238.744361162186, "step_time_sec": 8.227630089008017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3526, "loss": 4.333793640136719, "lr": 0.0002, "elapsed_sec": 29246.972849607468, "step_time_sec": 8.228324516006978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3527, "loss": 4.31712532043457, "lr": 0.0002, "elapsed_sec": 29255.20323896408, "step_time_sec": 8.230252326990012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3528, "loss": 4.46790885925293, "lr": 0.0002, "elapsed_sec": 29263.432749271393, "step_time_sec": 8.22928228098317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3529, "loss": 4.408263206481934, "lr": 0.0002, "elapsed_sec": 29271.663430929184, "step_time_sec": 8.230531503999373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3530, "loss": 4.42959451675415, "lr": 0.0002, "elapsed_sec": 29279.89426136017, "step_time_sec": 8.230664371018065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3531, "loss": 4.5398993492126465, "lr": 0.0002, "elapsed_sec": 29288.124953985214, "step_time_sec": 8.23052408901276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3532, "loss": 4.371072769165039, "lr": 0.0002, "elapsed_sec": 29296.356144189835, "step_time_sec": 8.2310111639963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3533, "loss": 4.334693908691406, "lr": 0.0002, "elapsed_sec": 29304.584560632706, "step_time_sec": 8.2282561440079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3534, "loss": 4.361059665679932, "lr": 0.0002, "elapsed_sec": 29312.81436944008, "step_time_sec": 8.229637982993154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3535, "loss": 4.306999683380127, "lr": 0.0002, "elapsed_sec": 29321.042066574097, "step_time_sec": 8.22758031100966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3536, "loss": 4.368964672088623, "lr": 0.0002, "elapsed_sec": 29329.27007508278, "step_time_sec": 8.227824544010218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3537, "loss": 4.3144636154174805, "lr": 0.0002, "elapsed_sec": 29337.49818968773, "step_time_sec": 8.227941199002089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3538, "loss": 4.451719760894775, "lr": 0.0002, "elapsed_sec": 29345.72913813591, "step_time_sec": 8.230855672998587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3539, "loss": 4.320417404174805, "lr": 0.0002, "elapsed_sec": 29353.959841489792, "step_time_sec": 8.230559557996457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3540, "loss": 4.403833866119385, "lr": 0.0002, "elapsed_sec": 29362.190856456757, "step_time_sec": 8.230787191016134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3541, "loss": 4.526712894439697, "lr": 0.0002, "elapsed_sec": 29370.42165493965, "step_time_sec": 8.230680181004573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3542, "loss": 4.478344440460205, "lr": 0.0002, "elapsed_sec": 29378.652591705322, "step_time_sec": 8.230716785998084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3543, "loss": 4.3526201248168945, "lr": 0.0002, "elapsed_sec": 29386.882267475128, "step_time_sec": 8.229579428996658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3544, "loss": 4.344573974609375, "lr": 0.0002, "elapsed_sec": 29395.11266684532, "step_time_sec": 8.230209430999821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3545, "loss": 4.439387321472168, "lr": 0.0002, "elapsed_sec": 29403.343564271927, "step_time_sec": 8.230742192972684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3546, "loss": 4.3487653732299805, "lr": 0.0002, "elapsed_sec": 29411.574638843536, "step_time_sec": 8.230908701982116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3547, "loss": 4.319957256317139, "lr": 0.0002, "elapsed_sec": 29419.80609369278, "step_time_sec": 8.231357393000508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3548, "loss": 4.442051410675049, "lr": 0.0002, "elapsed_sec": 29428.033308506012, "step_time_sec": 8.227025618980406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3549, "loss": 4.316030979156494, "lr": 0.0002, "elapsed_sec": 29436.26066350937, "step_time_sec": 8.227197859989246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3550, "loss": 4.40692138671875, "lr": 0.0002, "elapsed_sec": 29444.489879608154, "step_time_sec": 8.229071984998882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3551, "loss": 4.462306976318359, "lr": 0.0002, "elapsed_sec": 29452.72122812271, "step_time_sec": 8.231181176990503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3552, "loss": 4.495969295501709, "lr": 0.0002, "elapsed_sec": 29460.951219558716, "step_time_sec": 8.229884286993183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3553, "loss": 4.250987529754639, "lr": 0.0002, "elapsed_sec": 29469.181335926056, "step_time_sec": 8.229920659010531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3554, "loss": 4.411708354949951, "lr": 0.0002, "elapsed_sec": 29477.412867307663, "step_time_sec": 8.23135967599228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3555, "loss": 4.382138252258301, "lr": 0.0002, "elapsed_sec": 29485.643125534058, "step_time_sec": 8.230155686993385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3556, "loss": 4.5042853355407715, "lr": 0.0002, "elapsed_sec": 29493.871361732483, "step_time_sec": 8.228047612996306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3557, "loss": 4.335880756378174, "lr": 0.0002, "elapsed_sec": 29502.099951267242, "step_time_sec": 8.22844000402256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3558, "loss": 4.248302936553955, "lr": 0.0002, "elapsed_sec": 29510.330671072006, "step_time_sec": 8.230607748002512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3559, "loss": 4.397187232971191, "lr": 0.0002, "elapsed_sec": 29518.561107873917, "step_time_sec": 8.230302908981685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3560, "loss": 4.348598003387451, "lr": 0.0002, "elapsed_sec": 29526.78969836235, "step_time_sec": 8.228403983986937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3561, "loss": 4.271038055419922, "lr": 0.0002, "elapsed_sec": 29535.020233154297, "step_time_sec": 8.230417732003843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3562, "loss": 4.227813720703125, "lr": 0.0002, "elapsed_sec": 29543.248663902283, "step_time_sec": 8.22818322799867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3563, "loss": 4.350368499755859, "lr": 0.0002, "elapsed_sec": 29551.476739406586, "step_time_sec": 8.227925762010273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3564, "loss": 4.330943584442139, "lr": 0.0002, "elapsed_sec": 29559.70496582985, "step_time_sec": 8.22807974301395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3565, "loss": 4.351937770843506, "lr": 0.0002, "elapsed_sec": 29567.93360066414, "step_time_sec": 8.228442093008198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3566, "loss": 4.290771961212158, "lr": 0.0002, "elapsed_sec": 29576.164494991302, "step_time_sec": 8.230789393011946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3567, "loss": 4.329458713531494, "lr": 0.0002, "elapsed_sec": 29584.395139932632, "step_time_sec": 8.23045930598164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3568, "loss": 4.364384651184082, "lr": 0.0002, "elapsed_sec": 29592.62487387657, "step_time_sec": 8.22958454300533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3569, "loss": 4.317965030670166, "lr": 0.0002, "elapsed_sec": 29600.85365629196, "step_time_sec": 8.228602646995569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3570, "loss": 4.337486743927002, "lr": 0.0002, "elapsed_sec": 29609.081483364105, "step_time_sec": 8.227681899996242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3571, "loss": 4.3067240715026855, "lr": 0.0002, "elapsed_sec": 29617.312458753586, "step_time_sec": 8.230872654006816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3572, "loss": 4.561549663543701, "lr": 0.0002, "elapsed_sec": 29625.542698144913, "step_time_sec": 8.230097846011631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3573, "loss": 4.586500644683838, "lr": 0.0002, "elapsed_sec": 29633.772362947464, "step_time_sec": 8.229430141014745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3574, "loss": 4.42216157913208, "lr": 0.0002, "elapsed_sec": 29642.00106859207, "step_time_sec": 8.228564260993153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3575, "loss": 4.3441481590271, "lr": 0.0002, "elapsed_sec": 29650.229098796844, "step_time_sec": 8.227874387986958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3576, "loss": 4.419578552246094, "lr": 0.0002, "elapsed_sec": 29658.460809230804, "step_time_sec": 8.231555101985577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3577, "loss": 4.342338562011719, "lr": 0.0002, "elapsed_sec": 29666.691081762314, "step_time_sec": 8.230116763006663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3578, "loss": 4.368005752563477, "lr": 0.0002, "elapsed_sec": 29674.92257785797, "step_time_sec": 8.231339416990522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3579, "loss": 4.316127300262451, "lr": 0.0002, "elapsed_sec": 29683.153549432755, "step_time_sec": 8.230823184974724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3580, "loss": 4.370311260223389, "lr": 0.0002, "elapsed_sec": 29691.384413719177, "step_time_sec": 8.230753948999336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3581, "loss": 4.448378086090088, "lr": 0.0002, "elapsed_sec": 29699.615446567535, "step_time_sec": 8.230868034996092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3582, "loss": 4.27655553817749, "lr": 0.0002, "elapsed_sec": 29707.845742702484, "step_time_sec": 8.230101907014614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3583, "loss": 4.33707332611084, "lr": 0.0002, "elapsed_sec": 29716.076231002808, "step_time_sec": 8.230367972020758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3584, "loss": 4.460002422332764, "lr": 0.0002, "elapsed_sec": 29724.30490040779, "step_time_sec": 8.228501161007443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3585, "loss": 4.406341552734375, "lr": 0.0002, "elapsed_sec": 29732.533182621002, "step_time_sec": 8.22811548699974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3586, "loss": 4.384148597717285, "lr": 0.0002, "elapsed_sec": 29740.763323783875, "step_time_sec": 8.230018897011178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3587, "loss": 4.481139659881592, "lr": 0.0002, "elapsed_sec": 29748.993725061417, "step_time_sec": 8.23020773200551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3588, "loss": 4.439589500427246, "lr": 0.0002, "elapsed_sec": 29757.221354961395, "step_time_sec": 8.22747322099167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3589, "loss": 4.405970096588135, "lr": 0.0002, "elapsed_sec": 29765.45183491707, "step_time_sec": 8.230341761984164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3590, "loss": 4.351934432983398, "lr": 0.0002, "elapsed_sec": 29773.682834625244, "step_time_sec": 8.23090489700553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3591, "loss": 4.230014801025391, "lr": 0.0002, "elapsed_sec": 29781.910501241684, "step_time_sec": 8.227439353999216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3592, "loss": 4.363224029541016, "lr": 0.0002, "elapsed_sec": 29790.139518499374, "step_time_sec": 8.228852667991305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3593, "loss": 4.419138431549072, "lr": 0.0002, "elapsed_sec": 29798.37034893036, "step_time_sec": 8.230691748001846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3594, "loss": 4.347899913787842, "lr": 0.0002, "elapsed_sec": 29806.60070848465, "step_time_sec": 8.230261093995068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3595, "loss": 4.545469284057617, "lr": 0.0002, "elapsed_sec": 29814.831780433655, "step_time_sec": 8.230830375017831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3596, "loss": 4.379372596740723, "lr": 0.0002, "elapsed_sec": 29823.062423229218, "step_time_sec": 8.230477305012755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3597, "loss": 4.409631729125977, "lr": 0.0002, "elapsed_sec": 29831.293016910553, "step_time_sec": 8.230513961985707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3598, "loss": 4.433509826660156, "lr": 0.0002, "elapsed_sec": 29839.524561166763, "step_time_sec": 8.231324726977618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3599, "loss": 4.405034065246582, "lr": 0.0002, "elapsed_sec": 29847.75456237793, "step_time_sec": 8.229870505980216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3600, "loss": 4.368614673614502, "lr": 0.0002, "elapsed_sec": 29855.98581814766, "step_time_sec": 8.231051879003644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3601, "loss": 4.334927558898926, "lr": 0.0002, "elapsed_sec": 29864.214721918106, "step_time_sec": 8.22873881299165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3602, "loss": 4.407304763793945, "lr": 0.0002, "elapsed_sec": 29872.447311401367, "step_time_sec": 8.232462085987208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3603, "loss": 4.403789520263672, "lr": 0.0002, "elapsed_sec": 29880.67769575119, "step_time_sec": 8.230243291007355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3604, "loss": 4.3698930740356445, "lr": 0.0002, "elapsed_sec": 29888.907751321793, "step_time_sec": 8.229844037006842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3605, "loss": 4.383462905883789, "lr": 0.0002, "elapsed_sec": 29897.138866186142, "step_time_sec": 8.230961741995998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3606, "loss": 4.354090213775635, "lr": 0.0002, "elapsed_sec": 29905.36854505539, "step_time_sec": 8.229532233992359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3607, "loss": 4.213277816772461, "lr": 0.0002, "elapsed_sec": 29913.599564790726, "step_time_sec": 8.230926408985397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3608, "loss": 4.319405555725098, "lr": 0.0002, "elapsed_sec": 29921.830189943314, "step_time_sec": 8.23039280102239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3609, "loss": 4.3693389892578125, "lr": 0.0002, "elapsed_sec": 29930.060013771057, "step_time_sec": 8.229703160002828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3610, "loss": 4.506268501281738, "lr": 0.0002, "elapsed_sec": 29938.290361881256, "step_time_sec": 8.230226744984975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3611, "loss": 4.406532287597656, "lr": 0.0002, "elapsed_sec": 29946.521163225174, "step_time_sec": 8.230645798990736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3612, "loss": 4.340824127197266, "lr": 0.0002, "elapsed_sec": 29954.75282907486, "step_time_sec": 8.231474244006677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3613, "loss": 4.466582298278809, "lr": 0.0002, "elapsed_sec": 29962.982961177826, "step_time_sec": 8.22997795700212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3614, "loss": 4.35071325302124, "lr": 0.0002, "elapsed_sec": 29971.21393418312, "step_time_sec": 8.230839301017113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3615, "loss": 4.3043036460876465, "lr": 0.0002, "elapsed_sec": 29979.442049503326, "step_time_sec": 8.227978887007339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3616, "loss": 4.4495720863342285, "lr": 0.0002, "elapsed_sec": 29987.67182278633, "step_time_sec": 8.229563759989105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3617, "loss": 4.37445592880249, "lr": 0.0002, "elapsed_sec": 29995.899709701538, "step_time_sec": 8.227767033997225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3618, "loss": 4.5673933029174805, "lr": 0.0002, "elapsed_sec": 30004.12788748741, "step_time_sec": 8.22802615701221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3619, "loss": 4.40283727645874, "lr": 0.0002, "elapsed_sec": 30012.357127666473, "step_time_sec": 8.229052496986696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3620, "loss": 4.510135173797607, "lr": 0.0002, "elapsed_sec": 30020.58668398857, "step_time_sec": 8.229414414003259, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3621, "loss": 4.367496013641357, "lr": 0.0002, "elapsed_sec": 30028.817051172256, "step_time_sec": 8.230204701016191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3622, "loss": 4.355834484100342, "lr": 0.0002, "elapsed_sec": 30037.04830646515, "step_time_sec": 8.231170998973539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3623, "loss": 4.403988838195801, "lr": 0.0002, "elapsed_sec": 30045.278286218643, "step_time_sec": 8.229806854011258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3624, "loss": 4.34478235244751, "lr": 0.0002, "elapsed_sec": 30053.50932407379, "step_time_sec": 8.230840310017811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3625, "loss": 4.482344150543213, "lr": 0.0002, "elapsed_sec": 30061.73879981041, "step_time_sec": 8.229367575986544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3626, "loss": 4.317246437072754, "lr": 0.0002, "elapsed_sec": 30069.968816757202, "step_time_sec": 8.229814506019466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3627, "loss": 4.295644760131836, "lr": 0.0002, "elapsed_sec": 30078.197241783142, "step_time_sec": 8.228268059989205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3628, "loss": 4.121183395385742, "lr": 0.0002, "elapsed_sec": 30086.426116228104, "step_time_sec": 8.22870210200199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3629, "loss": 4.317293167114258, "lr": 0.0002, "elapsed_sec": 30094.654505729675, "step_time_sec": 8.22829906901461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3630, "loss": 4.397296905517578, "lr": 0.0002, "elapsed_sec": 30102.882869958878, "step_time_sec": 8.228156541998032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3631, "loss": 4.354253768920898, "lr": 0.0002, "elapsed_sec": 30111.111218214035, "step_time_sec": 8.228236582013778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3632, "loss": 4.364577293395996, "lr": 0.0002, "elapsed_sec": 30119.341255664825, "step_time_sec": 8.22984494100092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3633, "loss": 4.3377885818481445, "lr": 0.0002, "elapsed_sec": 30127.571726083755, "step_time_sec": 8.23032341900398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3634, "loss": 4.386941432952881, "lr": 0.0002, "elapsed_sec": 30135.801312446594, "step_time_sec": 8.229461401992012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3635, "loss": 4.37787389755249, "lr": 0.0002, "elapsed_sec": 30144.031567811966, "step_time_sec": 8.230056313012028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3636, "loss": 4.399217128753662, "lr": 0.0002, "elapsed_sec": 30152.262495040894, "step_time_sec": 8.230766987981042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3637, "loss": 4.263769626617432, "lr": 0.0002, "elapsed_sec": 30160.4927175045, "step_time_sec": 8.230081082991092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3638, "loss": 4.36077356338501, "lr": 0.0002, "elapsed_sec": 30168.724338769913, "step_time_sec": 8.231465493998257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3639, "loss": 4.301899433135986, "lr": 0.0002, "elapsed_sec": 30176.955103635788, "step_time_sec": 8.2306525829772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3640, "loss": 4.270628929138184, "lr": 0.0002, "elapsed_sec": 30185.183224916458, "step_time_sec": 8.227918699994916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3641, "loss": 4.302090167999268, "lr": 0.0002, "elapsed_sec": 30193.41121840477, "step_time_sec": 8.227874311996857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3642, "loss": 4.2947282791137695, "lr": 0.0002, "elapsed_sec": 30201.639434814453, "step_time_sec": 8.228063936985563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3643, "loss": 4.14316987991333, "lr": 0.0002, "elapsed_sec": 30209.868030309677, "step_time_sec": 8.228403105022153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3644, "loss": 4.357378959655762, "lr": 0.0002, "elapsed_sec": 30218.096234083176, "step_time_sec": 8.228090474993223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3645, "loss": 4.376167297363281, "lr": 0.0002, "elapsed_sec": 30226.325850725174, "step_time_sec": 8.229410859989002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3646, "loss": 4.390651226043701, "lr": 0.0002, "elapsed_sec": 30234.55664587021, "step_time_sec": 8.230658950982615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3647, "loss": 4.419891834259033, "lr": 0.0002, "elapsed_sec": 30242.782047986984, "step_time_sec": 8.225251993018901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3648, "loss": 4.601613521575928, "lr": 0.0002, "elapsed_sec": 30251.010637044907, "step_time_sec": 8.22846564601059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3649, "loss": 4.318861961364746, "lr": 0.0002, "elapsed_sec": 30259.239597797394, "step_time_sec": 8.228786799008958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3650, "loss": 4.298709392547607, "lr": 0.0002, "elapsed_sec": 30267.46869277954, "step_time_sec": 8.228885071992408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3651, "loss": 4.269018650054932, "lr": 0.0002, "elapsed_sec": 30275.696810483932, "step_time_sec": 8.22797491500387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3652, "loss": 4.467810153961182, "lr": 0.0002, "elapsed_sec": 30283.92742872238, "step_time_sec": 8.230475774995284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3653, "loss": 4.348565578460693, "lr": 0.0002, "elapsed_sec": 30292.15852165222, "step_time_sec": 8.230982310022227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3654, "loss": 4.349771022796631, "lr": 0.0002, "elapsed_sec": 30300.387762069702, "step_time_sec": 8.229039274010574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3655, "loss": 4.363048553466797, "lr": 0.0002, "elapsed_sec": 30308.618595600128, "step_time_sec": 8.230684170004679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3656, "loss": 4.317258834838867, "lr": 0.0002, "elapsed_sec": 30316.85005068779, "step_time_sec": 8.231363433995284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3657, "loss": 4.482484817504883, "lr": 0.0002, "elapsed_sec": 30325.080509901047, "step_time_sec": 8.230240796023281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3658, "loss": 4.446081161499023, "lr": 0.0002, "elapsed_sec": 30333.31059527397, "step_time_sec": 8.229930295987288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3659, "loss": 4.342315196990967, "lr": 0.0002, "elapsed_sec": 30341.54136443138, "step_time_sec": 8.230605260992888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3660, "loss": 4.397573947906494, "lr": 0.0002, "elapsed_sec": 30349.770139932632, "step_time_sec": 8.228652980003972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3661, "loss": 4.314742565155029, "lr": 0.0002, "elapsed_sec": 30357.999479055405, "step_time_sec": 8.229170821985463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3662, "loss": 4.417026996612549, "lr": 0.0002, "elapsed_sec": 30366.228969812393, "step_time_sec": 8.229390008986229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3663, "loss": 4.343773365020752, "lr": 0.0002, "elapsed_sec": 30374.45829463005, "step_time_sec": 8.229099321994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3664, "loss": 4.424603462219238, "lr": 0.0002, "elapsed_sec": 30382.685852766037, "step_time_sec": 8.227397721988382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3665, "loss": 4.468283176422119, "lr": 0.0002, "elapsed_sec": 30390.91626882553, "step_time_sec": 8.230289932980668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3666, "loss": 4.357036113739014, "lr": 0.0002, "elapsed_sec": 30399.14736700058, "step_time_sec": 8.230942349997349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3667, "loss": 4.365944862365723, "lr": 0.0002, "elapsed_sec": 30407.373687505722, "step_time_sec": 8.226189701992553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3668, "loss": 4.372641563415527, "lr": 0.0002, "elapsed_sec": 30415.60184454918, "step_time_sec": 8.22795531398151, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3669, "loss": 4.372580051422119, "lr": 0.0002, "elapsed_sec": 30423.83014702797, "step_time_sec": 8.228142163017765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3670, "loss": 4.374931335449219, "lr": 0.0002, "elapsed_sec": 30432.05884194374, "step_time_sec": 8.228567943995586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3671, "loss": 4.341894149780273, "lr": 0.0002, "elapsed_sec": 30440.28888273239, "step_time_sec": 8.229856830992503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3672, "loss": 4.329599380493164, "lr": 0.0002, "elapsed_sec": 30448.519101381302, "step_time_sec": 8.230073245998938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3673, "loss": 4.15116024017334, "lr": 0.0002, "elapsed_sec": 30456.749627113342, "step_time_sec": 8.23036615899764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3674, "loss": 4.328600883483887, "lr": 0.0002, "elapsed_sec": 30464.98104071617, "step_time_sec": 8.231257520004874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3675, "loss": 4.575311660766602, "lr": 0.0002, "elapsed_sec": 30473.21173143387, "step_time_sec": 8.230534260015702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3676, "loss": 4.348882675170898, "lr": 0.0002, "elapsed_sec": 30481.44314479828, "step_time_sec": 8.231304524000734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3677, "loss": 4.453239440917969, "lr": 0.0002, "elapsed_sec": 30489.672050237656, "step_time_sec": 8.228756243013777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3678, "loss": 4.489771366119385, "lr": 0.0002, "elapsed_sec": 30497.900651216507, "step_time_sec": 8.228379016014514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3679, "loss": 4.2723894119262695, "lr": 0.0002, "elapsed_sec": 30506.13122010231, "step_time_sec": 8.230474757991033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3680, "loss": 4.537083148956299, "lr": 0.0002, "elapsed_sec": 30514.35883116722, "step_time_sec": 8.227430638973601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3681, "loss": 4.341922760009766, "lr": 0.0002, "elapsed_sec": 30522.58809185028, "step_time_sec": 8.229069458000595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3682, "loss": 4.404867172241211, "lr": 0.0002, "elapsed_sec": 30530.815791130066, "step_time_sec": 8.227548661001492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3683, "loss": 4.383286952972412, "lr": 0.0002, "elapsed_sec": 30539.046384334564, "step_time_sec": 8.230451128998538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3684, "loss": 4.340182304382324, "lr": 0.0002, "elapsed_sec": 30547.276400327682, "step_time_sec": 8.229904715990415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3685, "loss": 4.292015552520752, "lr": 0.0002, "elapsed_sec": 30555.506823301315, "step_time_sec": 8.230297904025065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3686, "loss": 4.331972599029541, "lr": 0.0002, "elapsed_sec": 30563.73770260811, "step_time_sec": 8.230644044990186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3687, "loss": 4.416652679443359, "lr": 0.0002, "elapsed_sec": 30571.965799808502, "step_time_sec": 8.227957020979375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3688, "loss": 4.294590950012207, "lr": 0.0002, "elapsed_sec": 30580.19474554062, "step_time_sec": 8.228782999009127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3689, "loss": 4.395679950714111, "lr": 0.0002, "elapsed_sec": 30588.425390958786, "step_time_sec": 8.230561635980848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3690, "loss": 4.247930526733398, "lr": 0.0002, "elapsed_sec": 30596.653988838196, "step_time_sec": 8.22838581999531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3691, "loss": 4.208449363708496, "lr": 0.0002, "elapsed_sec": 30604.882849931717, "step_time_sec": 8.22865463900962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3692, "loss": 4.294539928436279, "lr": 0.0002, "elapsed_sec": 30613.11256456375, "step_time_sec": 8.229585560999112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3693, "loss": 4.267182350158691, "lr": 0.0002, "elapsed_sec": 30621.342190027237, "step_time_sec": 8.229525840026326, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3694, "loss": 4.465607166290283, "lr": 0.0002, "elapsed_sec": 30629.569586277008, "step_time_sec": 8.22721794701647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3695, "loss": 4.486808776855469, "lr": 0.0002, "elapsed_sec": 30637.79932165146, "step_time_sec": 8.229542443004902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3696, "loss": 4.291160583496094, "lr": 0.0002, "elapsed_sec": 30646.02872776985, "step_time_sec": 8.22931003998383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3697, "loss": 4.258844375610352, "lr": 0.0002, "elapsed_sec": 30654.257411956787, "step_time_sec": 8.22850164299598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3698, "loss": 4.383447170257568, "lr": 0.0002, "elapsed_sec": 30662.48638319969, "step_time_sec": 8.228860911010997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3699, "loss": 4.4064106941223145, "lr": 0.0002, "elapsed_sec": 30670.715782165527, "step_time_sec": 8.229201710986672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3700, "loss": 4.3877997398376465, "lr": 0.0002, "elapsed_sec": 30678.94558405876, "step_time_sec": 8.229709853971144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3701, "loss": 4.256030082702637, "lr": 0.0002, "elapsed_sec": 30687.174330472946, "step_time_sec": 8.228483934974065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3702, "loss": 4.218549728393555, "lr": 0.0002, "elapsed_sec": 30695.405047416687, "step_time_sec": 8.230630689999089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3703, "loss": 4.372588634490967, "lr": 0.0002, "elapsed_sec": 30703.636382818222, "step_time_sec": 8.231110148015432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3704, "loss": 4.462784767150879, "lr": 0.0002, "elapsed_sec": 30711.86745619774, "step_time_sec": 8.230906051001512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3705, "loss": 4.279874324798584, "lr": 0.0002, "elapsed_sec": 30720.097587823868, "step_time_sec": 8.230016763991443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3706, "loss": 4.3883562088012695, "lr": 0.0002, "elapsed_sec": 30728.328849315643, "step_time_sec": 8.231089505017735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3707, "loss": 4.210941314697266, "lr": 0.0002, "elapsed_sec": 30736.55913734436, "step_time_sec": 8.230189182999311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3708, "loss": 4.287497043609619, "lr": 0.0002, "elapsed_sec": 30744.79107117653, "step_time_sec": 8.231707409024239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3709, "loss": 4.307959079742432, "lr": 0.0002, "elapsed_sec": 30753.022100687027, "step_time_sec": 8.230879159003962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3710, "loss": 4.314382553100586, "lr": 0.0002, "elapsed_sec": 30761.252688407898, "step_time_sec": 8.230494254996302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3711, "loss": 4.286992073059082, "lr": 0.0002, "elapsed_sec": 30769.48049712181, "step_time_sec": 8.227621690981323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3712, "loss": 4.389692306518555, "lr": 0.0002, "elapsed_sec": 30777.706207752228, "step_time_sec": 8.225565628992626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3713, "loss": 4.425141334533691, "lr": 0.0002, "elapsed_sec": 30785.93745303154, "step_time_sec": 8.231096739007626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3714, "loss": 4.288214683532715, "lr": 0.0002, "elapsed_sec": 30794.169173002243, "step_time_sec": 8.231566868023947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3715, "loss": 4.338421821594238, "lr": 0.0002, "elapsed_sec": 30802.39741230011, "step_time_sec": 8.228043929004343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3716, "loss": 4.321913242340088, "lr": 0.0002, "elapsed_sec": 30810.62706375122, "step_time_sec": 8.229499026987469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3717, "loss": 4.3981451988220215, "lr": 0.0002, "elapsed_sec": 30818.85765147209, "step_time_sec": 8.230386558017926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3718, "loss": 4.408106327056885, "lr": 0.0002, "elapsed_sec": 30827.08598089218, "step_time_sec": 8.228105640999274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3719, "loss": 4.236610412597656, "lr": 0.0002, "elapsed_sec": 30835.31580543518, "step_time_sec": 8.229671209992375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3720, "loss": 4.413113117218018, "lr": 0.0002, "elapsed_sec": 30843.543423891068, "step_time_sec": 8.227477941982215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3721, "loss": 4.363086223602295, "lr": 0.0002, "elapsed_sec": 30851.77199959755, "step_time_sec": 8.2284140860138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3722, "loss": 4.384056568145752, "lr": 0.0002, "elapsed_sec": 30859.99959874153, "step_time_sec": 8.227403410011902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3723, "loss": 4.344746112823486, "lr": 0.0002, "elapsed_sec": 30868.230598688126, "step_time_sec": 8.230858805996832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3724, "loss": 4.347034454345703, "lr": 0.0002, "elapsed_sec": 30876.46157193184, "step_time_sec": 8.230846364982426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3725, "loss": 4.268102169036865, "lr": 0.0002, "elapsed_sec": 30884.68980407715, "step_time_sec": 8.228030238999054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3726, "loss": 4.310291767120361, "lr": 0.0002, "elapsed_sec": 30892.919234752655, "step_time_sec": 8.229334400006337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3727, "loss": 4.440749168395996, "lr": 0.0002, "elapsed_sec": 30901.147436141968, "step_time_sec": 8.228022566996515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3728, "loss": 4.283635139465332, "lr": 0.0002, "elapsed_sec": 30909.377561569214, "step_time_sec": 8.22992279601749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3729, "loss": 4.358355522155762, "lr": 0.0002, "elapsed_sec": 30917.608948230743, "step_time_sec": 8.231276191974757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3730, "loss": 4.47357177734375, "lr": 0.0002, "elapsed_sec": 30925.837299346924, "step_time_sec": 8.228147842979524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3731, "loss": 4.252403259277344, "lr": 0.0002, "elapsed_sec": 30934.06596302986, "step_time_sec": 8.22851070901379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3732, "loss": 4.363175868988037, "lr": 0.0002, "elapsed_sec": 30942.29418182373, "step_time_sec": 8.228040446992964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3733, "loss": 4.382525444030762, "lr": 0.0002, "elapsed_sec": 30950.523394346237, "step_time_sec": 8.229076107993023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3734, "loss": 4.328487873077393, "lr": 0.0002, "elapsed_sec": 30958.753266334534, "step_time_sec": 8.229708589002257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3735, "loss": 4.262984275817871, "lr": 0.0002, "elapsed_sec": 30966.98278927803, "step_time_sec": 8.229358348995447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3736, "loss": 4.2502641677856445, "lr": 0.0002, "elapsed_sec": 30975.276428699493, "step_time_sec": 8.236263491999125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3737, "loss": 4.368290424346924, "lr": 0.0002, "elapsed_sec": 30983.50717806816, "step_time_sec": 8.230550245003542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3738, "loss": 4.305673599243164, "lr": 0.0002, "elapsed_sec": 30991.737185955048, "step_time_sec": 8.229904410021845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3739, "loss": 4.345179080963135, "lr": 0.0002, "elapsed_sec": 30999.967150449753, "step_time_sec": 8.229771384008927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3740, "loss": 4.3810296058654785, "lr": 0.0002, "elapsed_sec": 31008.198199033737, "step_time_sec": 8.230942510010209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3741, "loss": 4.3198723793029785, "lr": 0.0002, "elapsed_sec": 31016.428431749344, "step_time_sec": 8.230055099003948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3742, "loss": 4.3038225173950195, "lr": 0.0002, "elapsed_sec": 31024.659512043, "step_time_sec": 8.230923242022982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3743, "loss": 4.3692097663879395, "lr": 0.0002, "elapsed_sec": 31032.887031555176, "step_time_sec": 8.227374920010334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3744, "loss": 4.277916431427002, "lr": 0.0002, "elapsed_sec": 31041.117719888687, "step_time_sec": 8.230524479993619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3745, "loss": 4.221821308135986, "lr": 0.0002, "elapsed_sec": 31049.345695734024, "step_time_sec": 8.227770069992403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3746, "loss": 4.36283016204834, "lr": 0.0002, "elapsed_sec": 31057.573836565018, "step_time_sec": 8.227966353006195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3747, "loss": 4.346493244171143, "lr": 0.0002, "elapsed_sec": 31065.802708864212, "step_time_sec": 8.228746406995924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3748, "loss": 4.363189220428467, "lr": 0.0002, "elapsed_sec": 31074.031669139862, "step_time_sec": 8.22880533800344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3749, "loss": 4.382987022399902, "lr": 0.0002, "elapsed_sec": 31082.258832216263, "step_time_sec": 8.22700623300625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3750, "loss": 4.429785251617432, "lr": 0.0002, "elapsed_sec": 31090.486485004425, "step_time_sec": 8.227500142005738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3751, "loss": 4.334851264953613, "lr": 0.0002, "elapsed_sec": 31098.715260505676, "step_time_sec": 8.228661979024764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3752, "loss": 4.3161301612854, "lr": 0.0002, "elapsed_sec": 31106.944999456406, "step_time_sec": 8.229568834998645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3753, "loss": 4.375868797302246, "lr": 0.0002, "elapsed_sec": 31115.173609495163, "step_time_sec": 8.228459374979138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3754, "loss": 4.425106525421143, "lr": 0.0002, "elapsed_sec": 31123.402047872543, "step_time_sec": 8.228238053998211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3755, "loss": 4.256992340087891, "lr": 0.0002, "elapsed_sec": 31131.630661725998, "step_time_sec": 8.228493351023644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3756, "loss": 4.236534595489502, "lr": 0.0002, "elapsed_sec": 31139.858304023743, "step_time_sec": 8.227450589998625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3757, "loss": 4.369329452514648, "lr": 0.0002, "elapsed_sec": 31148.0867395401, "step_time_sec": 8.228331306978362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3758, "loss": 4.334201335906982, "lr": 0.0002, "elapsed_sec": 31156.31579709053, "step_time_sec": 8.228853641019668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3759, "loss": 4.208539009094238, "lr": 0.0002, "elapsed_sec": 31164.54297709465, "step_time_sec": 8.227031714020995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3760, "loss": 4.311966419219971, "lr": 0.0002, "elapsed_sec": 31172.774077415466, "step_time_sec": 8.230942780995974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3761, "loss": 4.307900428771973, "lr": 0.0002, "elapsed_sec": 31181.004929304123, "step_time_sec": 8.230708995979512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3762, "loss": 4.357756614685059, "lr": 0.0002, "elapsed_sec": 31189.23698401451, "step_time_sec": 8.231902970990632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3763, "loss": 4.397282123565674, "lr": 0.0002, "elapsed_sec": 31197.465217351913, "step_time_sec": 8.228050700010499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3764, "loss": 4.259530067443848, "lr": 0.0002, "elapsed_sec": 31205.69493174553, "step_time_sec": 8.229565721994732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3765, "loss": 4.272098064422607, "lr": 0.0002, "elapsed_sec": 31213.925913095474, "step_time_sec": 8.230904964002548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3766, "loss": 4.189880847930908, "lr": 0.0002, "elapsed_sec": 31222.15657877922, "step_time_sec": 8.230411949014524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3767, "loss": 4.29555082321167, "lr": 0.0002, "elapsed_sec": 31230.387469530106, "step_time_sec": 8.230731425981503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3768, "loss": 4.379024028778076, "lr": 0.0002, "elapsed_sec": 31238.617520093918, "step_time_sec": 8.229912028997205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3769, "loss": 4.469236850738525, "lr": 0.0002, "elapsed_sec": 31246.846724033356, "step_time_sec": 8.229117584996857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3770, "loss": 4.407219886779785, "lr": 0.0002, "elapsed_sec": 31255.076942920685, "step_time_sec": 8.230024877993856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3771, "loss": 4.270740032196045, "lr": 0.0002, "elapsed_sec": 31263.307312488556, "step_time_sec": 8.23020069897757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3772, "loss": 4.299328804016113, "lr": 0.0002, "elapsed_sec": 31271.536500930786, "step_time_sec": 8.229025493987137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3773, "loss": 4.481057643890381, "lr": 0.0002, "elapsed_sec": 31279.76781487465, "step_time_sec": 8.231133850989863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3774, "loss": 4.450122833251953, "lr": 0.0002, "elapsed_sec": 31287.998888731003, "step_time_sec": 8.230951108009322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3775, "loss": 4.306571006774902, "lr": 0.0002, "elapsed_sec": 31296.226900577545, "step_time_sec": 8.227823737979634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3776, "loss": 4.348840236663818, "lr": 0.0002, "elapsed_sec": 31304.45473766327, "step_time_sec": 8.227693269989686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3777, "loss": 4.415161609649658, "lr": 0.0002, "elapsed_sec": 31312.684504270554, "step_time_sec": 8.229564089007908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3778, "loss": 4.316957950592041, "lr": 0.0002, "elapsed_sec": 31320.91276526451, "step_time_sec": 8.22812871800852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3779, "loss": 4.295652866363525, "lr": 0.0002, "elapsed_sec": 31329.1422662735, "step_time_sec": 8.229390496999258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3780, "loss": 4.458754539489746, "lr": 0.0002, "elapsed_sec": 31337.373203277588, "step_time_sec": 8.230739788006758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3781, "loss": 4.314869403839111, "lr": 0.0002, "elapsed_sec": 31345.60399246216, "step_time_sec": 8.23062406599638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3782, "loss": 4.41377592086792, "lr": 0.0002, "elapsed_sec": 31353.83327627182, "step_time_sec": 8.229108455008827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3783, "loss": 4.390407085418701, "lr": 0.0002, "elapsed_sec": 31362.06295442581, "step_time_sec": 8.229584232991328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3784, "loss": 4.296734809875488, "lr": 0.0002, "elapsed_sec": 31370.294163942337, "step_time_sec": 8.231007308000699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3785, "loss": 4.350225448608398, "lr": 0.0002, "elapsed_sec": 31378.524210453033, "step_time_sec": 8.229966996004805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3786, "loss": 4.290098190307617, "lr": 0.0002, "elapsed_sec": 31386.754507780075, "step_time_sec": 8.230105569993611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3787, "loss": 4.5767035484313965, "lr": 0.0002, "elapsed_sec": 31394.985919475555, "step_time_sec": 8.231290665018605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3788, "loss": 4.336678504943848, "lr": 0.0002, "elapsed_sec": 31403.215519428253, "step_time_sec": 8.229375598020852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3789, "loss": 4.073936462402344, "lr": 0.0002, "elapsed_sec": 31411.44603586197, "step_time_sec": 8.230355676001636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3790, "loss": 4.2929840087890625, "lr": 0.0002, "elapsed_sec": 31419.676407575607, "step_time_sec": 8.230184684012784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3791, "loss": 4.225894451141357, "lr": 0.0002, "elapsed_sec": 31427.9061729908, "step_time_sec": 8.229611801012652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3792, "loss": 4.382284164428711, "lr": 0.0002, "elapsed_sec": 31436.136355161667, "step_time_sec": 8.230037800996797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3793, "loss": 4.2564921379089355, "lr": 0.0002, "elapsed_sec": 31444.366619110107, "step_time_sec": 8.230107668001438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3794, "loss": 4.3422465324401855, "lr": 0.0002, "elapsed_sec": 31452.596866369247, "step_time_sec": 8.230123354995158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3795, "loss": 4.328639030456543, "lr": 0.0002, "elapsed_sec": 31460.82451748848, "step_time_sec": 8.22744173699175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3796, "loss": 4.351274013519287, "lr": 0.0002, "elapsed_sec": 31469.055463314056, "step_time_sec": 8.230791070003761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3797, "loss": 4.47899866104126, "lr": 0.0002, "elapsed_sec": 31477.28581929207, "step_time_sec": 8.230262465018313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3798, "loss": 4.301695346832275, "lr": 0.0002, "elapsed_sec": 31485.517323732376, "step_time_sec": 8.231312211981276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3799, "loss": 4.41202449798584, "lr": 0.0002, "elapsed_sec": 31493.749573230743, "step_time_sec": 8.232030459999805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3800, "loss": 4.215738773345947, "lr": 0.0002, "elapsed_sec": 31501.980162620544, "step_time_sec": 8.230479896010365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3801, "loss": 4.4170074462890625, "lr": 0.0002, "elapsed_sec": 31510.210983514786, "step_time_sec": 8.230647766002221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3802, "loss": 4.231374263763428, "lr": 0.0002, "elapsed_sec": 31518.440893173218, "step_time_sec": 8.229780241992557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3803, "loss": 4.288003444671631, "lr": 0.0002, "elapsed_sec": 31526.669897317886, "step_time_sec": 8.228832672000863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3804, "loss": 4.317137241363525, "lr": 0.0002, "elapsed_sec": 31534.90013360977, "step_time_sec": 8.230048764002277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3805, "loss": 4.289917469024658, "lr": 0.0002, "elapsed_sec": 31543.128028154373, "step_time_sec": 8.227752683975268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3806, "loss": 4.4511284828186035, "lr": 0.0002, "elapsed_sec": 31551.359020471573, "step_time_sec": 8.23083673499059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3807, "loss": 4.257936477661133, "lr": 0.0002, "elapsed_sec": 31559.589111328125, "step_time_sec": 8.229936640011147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3808, "loss": 4.304121971130371, "lr": 0.0002, "elapsed_sec": 31567.818137407303, "step_time_sec": 8.228905427007703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3809, "loss": 4.261103630065918, "lr": 0.0002, "elapsed_sec": 31576.049158096313, "step_time_sec": 8.230802051984938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3810, "loss": 4.407131671905518, "lr": 0.0002, "elapsed_sec": 31584.279551267624, "step_time_sec": 8.230248993000714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3811, "loss": 4.26809549331665, "lr": 0.0002, "elapsed_sec": 31592.510011434555, "step_time_sec": 8.230304560012883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3812, "loss": 4.280922889709473, "lr": 0.0002, "elapsed_sec": 31600.740103960037, "step_time_sec": 8.229914285999257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3813, "loss": 4.188467502593994, "lr": 0.0002, "elapsed_sec": 31608.96900868416, "step_time_sec": 8.228765614010626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3814, "loss": 4.396171569824219, "lr": 0.0002, "elapsed_sec": 31617.197113752365, "step_time_sec": 8.2279789079912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3815, "loss": 4.330630779266357, "lr": 0.0002, "elapsed_sec": 31625.425542116165, "step_time_sec": 8.228255172027275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3816, "loss": 4.336909294128418, "lr": 0.0002, "elapsed_sec": 31633.65650701523, "step_time_sec": 8.230875964014558, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3817, "loss": 4.3215460777282715, "lr": 0.0002, "elapsed_sec": 31641.887667655945, "step_time_sec": 8.230966978997458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3818, "loss": 4.120808124542236, "lr": 0.0002, "elapsed_sec": 31650.1191945076, "step_time_sec": 8.231334563985001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3819, "loss": 4.331411361694336, "lr": 0.0002, "elapsed_sec": 31658.349923610687, "step_time_sec": 8.230586613994092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3820, "loss": 4.3912224769592285, "lr": 0.0002, "elapsed_sec": 31666.581312656403, "step_time_sec": 8.231269074021839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3821, "loss": 4.272830963134766, "lr": 0.0002, "elapsed_sec": 31674.811027288437, "step_time_sec": 8.229517987987492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3822, "loss": 4.377232551574707, "lr": 0.0002, "elapsed_sec": 31683.041194200516, "step_time_sec": 8.230039961985312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3823, "loss": 4.256333827972412, "lr": 0.0002, "elapsed_sec": 31691.272516965866, "step_time_sec": 8.231126322003547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3824, "loss": 4.431614398956299, "lr": 0.0002, "elapsed_sec": 31699.503540992737, "step_time_sec": 8.2308996040083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3825, "loss": 4.1836347579956055, "lr": 0.0002, "elapsed_sec": 31707.73529958725, "step_time_sec": 8.231569230993045, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3826, "loss": 4.150941848754883, "lr": 0.0002, "elapsed_sec": 31715.96590924263, "step_time_sec": 8.230442482978106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3827, "loss": 4.342574119567871, "lr": 0.0002, "elapsed_sec": 31724.19496035576, "step_time_sec": 8.228916742024012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3828, "loss": 4.28326416015625, "lr": 0.0002, "elapsed_sec": 31732.42486190796, "step_time_sec": 8.229747776000295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3829, "loss": 4.237391948699951, "lr": 0.0002, "elapsed_sec": 31740.653920650482, "step_time_sec": 8.228905024996493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3830, "loss": 4.340160369873047, "lr": 0.0002, "elapsed_sec": 31748.88503575325, "step_time_sec": 8.231004150002263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3831, "loss": 4.244538307189941, "lr": 0.0002, "elapsed_sec": 31757.11570906639, "step_time_sec": 8.230486052983906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3832, "loss": 4.319122791290283, "lr": 0.0002, "elapsed_sec": 31765.347103357315, "step_time_sec": 8.23119777397369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3833, "loss": 4.277496337890625, "lr": 0.0002, "elapsed_sec": 31773.576152086258, "step_time_sec": 8.228955277008936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3834, "loss": 4.389785289764404, "lr": 0.0002, "elapsed_sec": 31781.807203292847, "step_time_sec": 8.230863166012568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3835, "loss": 4.145439147949219, "lr": 0.0002, "elapsed_sec": 31790.03978562355, "step_time_sec": 8.232410149998032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3836, "loss": 4.139077186584473, "lr": 0.0002, "elapsed_sec": 31798.270528316498, "step_time_sec": 8.230671250988962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3837, "loss": 4.190182209014893, "lr": 0.0002, "elapsed_sec": 31806.500913143158, "step_time_sec": 8.230163478001487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3838, "loss": 4.167932033538818, "lr": 0.0002, "elapsed_sec": 31814.731845617294, "step_time_sec": 8.230816619005054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3839, "loss": 4.084413051605225, "lr": 0.0002, "elapsed_sec": 31822.963587284088, "step_time_sec": 8.231558266998036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3840, "loss": 4.274399280548096, "lr": 0.0002, "elapsed_sec": 31831.194047927856, "step_time_sec": 8.23031594598433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3841, "loss": 4.275119304656982, "lr": 0.0002, "elapsed_sec": 31839.42563033104, "step_time_sec": 8.231411701999605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3842, "loss": 4.353847980499268, "lr": 0.0002, "elapsed_sec": 31847.65371108055, "step_time_sec": 8.227906754007563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3843, "loss": 4.250579833984375, "lr": 0.0002, "elapsed_sec": 31855.88263821602, "step_time_sec": 8.228770090994658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3844, "loss": 4.2613115310668945, "lr": 0.0002, "elapsed_sec": 31864.112018108368, "step_time_sec": 8.22922277401085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3845, "loss": 4.220020294189453, "lr": 0.0002, "elapsed_sec": 31872.34236931801, "step_time_sec": 8.23023260300397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3846, "loss": 4.328530311584473, "lr": 0.0002, "elapsed_sec": 31880.573424339294, "step_time_sec": 8.230880536983022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3847, "loss": 4.111883163452148, "lr": 0.0002, "elapsed_sec": 31888.80087375641, "step_time_sec": 8.227314591000322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3848, "loss": 4.33354377746582, "lr": 0.0002, "elapsed_sec": 31897.029302597046, "step_time_sec": 8.228314463980496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3849, "loss": 4.300914764404297, "lr": 0.0002, "elapsed_sec": 31905.259470939636, "step_time_sec": 8.229979319003178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3850, "loss": 4.314345836639404, "lr": 0.0002, "elapsed_sec": 31913.487662792206, "step_time_sec": 8.227999509021174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3851, "loss": 4.226129055023193, "lr": 0.0002, "elapsed_sec": 31921.716953754425, "step_time_sec": 8.22918033300084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3852, "loss": 4.25871467590332, "lr": 0.0002, "elapsed_sec": 31929.945899009705, "step_time_sec": 8.22875552001642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3853, "loss": 4.356433391571045, "lr": 0.0002, "elapsed_sec": 31938.173632144928, "step_time_sec": 8.2275514859939, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3854, "loss": 4.277080535888672, "lr": 0.0002, "elapsed_sec": 31946.40124297142, "step_time_sec": 8.227467894990696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3855, "loss": 4.2778801918029785, "lr": 0.0002, "elapsed_sec": 31954.62933063507, "step_time_sec": 8.228011667000828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3856, "loss": 4.382030963897705, "lr": 0.0002, "elapsed_sec": 31962.85838651657, "step_time_sec": 8.228844779980136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3857, "loss": 4.154329299926758, "lr": 0.0002, "elapsed_sec": 31971.0886862278, "step_time_sec": 8.230137100996217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3858, "loss": 4.327300548553467, "lr": 0.0002, "elapsed_sec": 31979.31985592842, "step_time_sec": 8.23100068300846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3859, "loss": 4.259330749511719, "lr": 0.0002, "elapsed_sec": 31987.550444364548, "step_time_sec": 8.230474226002116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3860, "loss": 4.31917667388916, "lr": 0.0002, "elapsed_sec": 31995.778881072998, "step_time_sec": 8.228248027997324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3861, "loss": 4.17183256149292, "lr": 0.0002, "elapsed_sec": 32004.00857949257, "step_time_sec": 8.229613914998481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3862, "loss": 4.373720169067383, "lr": 0.0002, "elapsed_sec": 32012.239346027374, "step_time_sec": 8.230587941012345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3863, "loss": 4.311954975128174, "lr": 0.0002, "elapsed_sec": 32020.469907045364, "step_time_sec": 8.230372709018411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3864, "loss": 4.360599040985107, "lr": 0.0002, "elapsed_sec": 32028.701379060745, "step_time_sec": 8.231302826985484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3865, "loss": 4.2553486824035645, "lr": 0.0002, "elapsed_sec": 32036.93158507347, "step_time_sec": 8.2300603230251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3866, "loss": 4.167922496795654, "lr": 0.0002, "elapsed_sec": 32045.15955400467, "step_time_sec": 8.22781120298896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3867, "loss": 4.277260780334473, "lr": 0.0002, "elapsed_sec": 32053.3904504776, "step_time_sec": 8.230718876991887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3868, "loss": 4.3425493240356445, "lr": 0.0002, "elapsed_sec": 32061.621431827545, "step_time_sec": 8.23083918399061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3869, "loss": 4.2551069259643555, "lr": 0.0002, "elapsed_sec": 32069.851967573166, "step_time_sec": 8.230378501000814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3870, "loss": 4.298026084899902, "lr": 0.0002, "elapsed_sec": 32078.082473278046, "step_time_sec": 8.230349442979787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3871, "loss": 4.358399391174316, "lr": 0.0002, "elapsed_sec": 32086.31368470192, "step_time_sec": 8.231068496010266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3872, "loss": 4.185969352722168, "lr": 0.0002, "elapsed_sec": 32094.54557824135, "step_time_sec": 8.231735925015528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3873, "loss": 4.361919403076172, "lr": 0.0002, "elapsed_sec": 32102.774753808975, "step_time_sec": 8.229030515998602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3874, "loss": 4.200232982635498, "lr": 0.0002, "elapsed_sec": 32111.004696130753, "step_time_sec": 8.229777914006263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3875, "loss": 4.371757507324219, "lr": 0.0002, "elapsed_sec": 32119.234733343124, "step_time_sec": 8.229957055998966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3876, "loss": 4.138255596160889, "lr": 0.0002, "elapsed_sec": 32127.46549487114, "step_time_sec": 8.230582444986794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3877, "loss": 4.416727066040039, "lr": 0.0002, "elapsed_sec": 32135.6967151165, "step_time_sec": 8.23101048200624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3878, "loss": 4.195880889892578, "lr": 0.0002, "elapsed_sec": 32143.927642822266, "step_time_sec": 8.23082128498936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3879, "loss": 4.240340709686279, "lr": 0.0002, "elapsed_sec": 32152.158358335495, "step_time_sec": 8.230542465986218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3880, "loss": 4.3380446434021, "lr": 0.0002, "elapsed_sec": 32160.38964319229, "step_time_sec": 8.231124585989164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3881, "loss": 4.4062981605529785, "lr": 0.0002, "elapsed_sec": 32168.618291139603, "step_time_sec": 8.228507658990566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3882, "loss": 4.243563175201416, "lr": 0.0002, "elapsed_sec": 32176.848026752472, "step_time_sec": 8.229609119996894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3883, "loss": 4.202719688415527, "lr": 0.0002, "elapsed_sec": 32185.078941583633, "step_time_sec": 8.230721917992923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3884, "loss": 4.261735439300537, "lr": 0.0002, "elapsed_sec": 32193.30961918831, "step_time_sec": 8.230512209003791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3885, "loss": 4.311820983886719, "lr": 0.0002, "elapsed_sec": 32201.539818286896, "step_time_sec": 8.23005284901592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3886, "loss": 4.381129741668701, "lr": 0.0002, "elapsed_sec": 32209.769422531128, "step_time_sec": 8.229456575005315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3887, "loss": 4.307270050048828, "lr": 0.0002, "elapsed_sec": 32217.998383760452, "step_time_sec": 8.22879690802074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3888, "loss": 4.227459907531738, "lr": 0.0002, "elapsed_sec": 32226.228764772415, "step_time_sec": 8.23022903999663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3889, "loss": 4.277250289916992, "lr": 0.0002, "elapsed_sec": 32234.459151506424, "step_time_sec": 8.230303756019566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3890, "loss": 4.3239312171936035, "lr": 0.0002, "elapsed_sec": 32242.690387248993, "step_time_sec": 8.231035418022657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3891, "loss": 4.090926647186279, "lr": 0.0002, "elapsed_sec": 32250.919135332108, "step_time_sec": 8.22858473600354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3892, "loss": 4.1803460121154785, "lr": 0.0002, "elapsed_sec": 32259.149522066116, "step_time_sec": 8.230282070988324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3893, "loss": 4.438586711883545, "lr": 0.0002, "elapsed_sec": 32267.379987478256, "step_time_sec": 8.23030579200713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3894, "loss": 4.375459671020508, "lr": 0.0002, "elapsed_sec": 32275.610587358475, "step_time_sec": 8.230377441010205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3895, "loss": 4.275376319885254, "lr": 0.0002, "elapsed_sec": 32283.842797756195, "step_time_sec": 8.232052369014127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3896, "loss": 4.228475093841553, "lr": 0.0002, "elapsed_sec": 32292.073044538498, "step_time_sec": 8.230087117000949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3897, "loss": 4.214354038238525, "lr": 0.0002, "elapsed_sec": 32300.304495573044, "step_time_sec": 8.231300677987747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3898, "loss": 4.291790962219238, "lr": 0.0002, "elapsed_sec": 32308.535672664642, "step_time_sec": 8.231020989012904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3899, "loss": 4.108234882354736, "lr": 0.0002, "elapsed_sec": 32316.767096042633, "step_time_sec": 8.231272130011348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3900, "loss": 4.3747239112854, "lr": 0.0002, "elapsed_sec": 32324.997485399246, "step_time_sec": 8.230280832998687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3901, "loss": 4.477768421173096, "lr": 0.0002, "elapsed_sec": 32333.22814846039, "step_time_sec": 8.230472465977073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3902, "loss": 4.349101543426514, "lr": 0.0002, "elapsed_sec": 32341.459362745285, "step_time_sec": 8.231069747009315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3903, "loss": 4.480299949645996, "lr": 0.0002, "elapsed_sec": 32349.689737796783, "step_time_sec": 8.230271795007866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3904, "loss": 4.311545372009277, "lr": 0.0002, "elapsed_sec": 32357.919539690018, "step_time_sec": 8.229578102997039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3905, "loss": 4.170235633850098, "lr": 0.0002, "elapsed_sec": 32366.150017023087, "step_time_sec": 8.230317161011044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3906, "loss": 4.339476108551025, "lr": 0.0002, "elapsed_sec": 32374.378854751587, "step_time_sec": 8.228706983994925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3907, "loss": 4.249515533447266, "lr": 0.0002, "elapsed_sec": 32382.608156204224, "step_time_sec": 8.229149227001471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3908, "loss": 4.266180515289307, "lr": 0.0002, "elapsed_sec": 32390.83783507347, "step_time_sec": 8.229471712984378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3909, "loss": 4.304264068603516, "lr": 0.0002, "elapsed_sec": 32399.068893432617, "step_time_sec": 8.23093498198432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3910, "loss": 4.212134838104248, "lr": 0.0002, "elapsed_sec": 32407.298929214478, "step_time_sec": 8.229930206987774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3911, "loss": 4.338644027709961, "lr": 0.0002, "elapsed_sec": 32415.52836704254, "step_time_sec": 8.229272352997214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3912, "loss": 4.468347549438477, "lr": 0.0002, "elapsed_sec": 32423.756993055344, "step_time_sec": 8.228388324991101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3913, "loss": 4.279786586761475, "lr": 0.0002, "elapsed_sec": 32431.98514842987, "step_time_sec": 8.228012843988836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3914, "loss": 4.272038459777832, "lr": 0.0002, "elapsed_sec": 32440.215823173523, "step_time_sec": 8.230523521022405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3915, "loss": 4.37983512878418, "lr": 0.0002, "elapsed_sec": 32448.447207450867, "step_time_sec": 8.231334719981533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3916, "loss": 4.336924076080322, "lr": 0.0002, "elapsed_sec": 32456.67639183998, "step_time_sec": 8.22894256201107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3917, "loss": 4.224004745483398, "lr": 0.0002, "elapsed_sec": 32464.90575361252, "step_time_sec": 8.229193767998368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3918, "loss": 4.259735584259033, "lr": 0.0002, "elapsed_sec": 32473.135135650635, "step_time_sec": 8.229206410003826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3919, "loss": 4.301487922668457, "lr": 0.0002, "elapsed_sec": 32481.365961551666, "step_time_sec": 8.230703187000472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3920, "loss": 4.216820240020752, "lr": 0.0002, "elapsed_sec": 32489.596650600433, "step_time_sec": 8.230554267996922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3921, "loss": 4.457220077514648, "lr": 0.0002, "elapsed_sec": 32497.828127622604, "step_time_sec": 8.231302995001897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3922, "loss": 4.246103763580322, "lr": 0.0002, "elapsed_sec": 32506.057354211807, "step_time_sec": 8.22903198300628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3923, "loss": 4.310373783111572, "lr": 0.0002, "elapsed_sec": 32514.28606414795, "step_time_sec": 8.228559976996621, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3924, "loss": 4.334420204162598, "lr": 0.0002, "elapsed_sec": 32522.51427745819, "step_time_sec": 8.228109343006508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3925, "loss": 4.301932334899902, "lr": 0.0002, "elapsed_sec": 32530.742826223373, "step_time_sec": 8.228348442993592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3926, "loss": 4.4106035232543945, "lr": 0.0002, "elapsed_sec": 32538.972368717194, "step_time_sec": 8.229382518009515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3927, "loss": 4.340314865112305, "lr": 0.0002, "elapsed_sec": 32547.200555562973, "step_time_sec": 8.228003093012376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3928, "loss": 4.366415500640869, "lr": 0.0002, "elapsed_sec": 32555.42884516716, "step_time_sec": 8.228164996020496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3929, "loss": 4.415005207061768, "lr": 0.0002, "elapsed_sec": 32563.659534931183, "step_time_sec": 8.230506268999306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3930, "loss": 4.26270055770874, "lr": 0.0002, "elapsed_sec": 32571.8895945549, "step_time_sec": 8.229875158984214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3931, "loss": 4.18297004699707, "lr": 0.0002, "elapsed_sec": 32580.120247364044, "step_time_sec": 8.23050444398541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3932, "loss": 4.297182559967041, "lr": 0.0002, "elapsed_sec": 32588.349094629288, "step_time_sec": 8.228704937006114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3933, "loss": 4.224771499633789, "lr": 0.0002, "elapsed_sec": 32596.580075979233, "step_time_sec": 8.230879440990975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3934, "loss": 4.26038122177124, "lr": 0.0002, "elapsed_sec": 32604.810129404068, "step_time_sec": 8.229882695013657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3935, "loss": 4.429069519042969, "lr": 0.0002, "elapsed_sec": 32613.04045343399, "step_time_sec": 8.230121898988727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3936, "loss": 4.204877853393555, "lr": 0.0002, "elapsed_sec": 32621.27154326439, "step_time_sec": 8.230974389007315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3937, "loss": 4.31327486038208, "lr": 0.0002, "elapsed_sec": 32629.502623081207, "step_time_sec": 8.230957538005896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3938, "loss": 4.420902729034424, "lr": 0.0002, "elapsed_sec": 32637.733356952667, "step_time_sec": 8.230564072000561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3939, "loss": 4.186838626861572, "lr": 0.0002, "elapsed_sec": 32645.96367096901, "step_time_sec": 8.230115476995707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3940, "loss": 4.304981708526611, "lr": 0.0002, "elapsed_sec": 32654.19346523285, "step_time_sec": 8.229629857989494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3941, "loss": 4.388359546661377, "lr": 0.0002, "elapsed_sec": 32662.42341375351, "step_time_sec": 8.229788258002372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3942, "loss": 4.277223587036133, "lr": 0.0002, "elapsed_sec": 32670.65033721924, "step_time_sec": 8.226765828003408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3943, "loss": 4.1953959465026855, "lr": 0.0002, "elapsed_sec": 32678.88058900833, "step_time_sec": 8.230116935999831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3944, "loss": 4.362602710723877, "lr": 0.0002, "elapsed_sec": 32687.110641002655, "step_time_sec": 8.22987871998339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3945, "loss": 4.436655521392822, "lr": 0.0002, "elapsed_sec": 32695.339935064316, "step_time_sec": 8.229162183008157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3946, "loss": 4.286576271057129, "lr": 0.0002, "elapsed_sec": 32703.56959748268, "step_time_sec": 8.229499670007499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3947, "loss": 4.3497796058654785, "lr": 0.0002, "elapsed_sec": 32711.800163269043, "step_time_sec": 8.230429102986818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3948, "loss": 4.315189838409424, "lr": 0.0002, "elapsed_sec": 32720.03176164627, "step_time_sec": 8.23143991001416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3949, "loss": 4.492434978485107, "lr": 0.0002, "elapsed_sec": 32728.261548519135, "step_time_sec": 8.229604284017114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3950, "loss": 4.351651668548584, "lr": 0.0002, "elapsed_sec": 32736.49280309677, "step_time_sec": 8.231104831997072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3951, "loss": 4.38931131362915, "lr": 0.0002, "elapsed_sec": 32744.723829984665, "step_time_sec": 8.230920963018434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3952, "loss": 4.326233863830566, "lr": 0.0002, "elapsed_sec": 32752.954044818878, "step_time_sec": 8.230064949020743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3953, "loss": 4.314541339874268, "lr": 0.0002, "elapsed_sec": 32761.182446479797, "step_time_sec": 8.228184165025596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3954, "loss": 4.195590019226074, "lr": 0.0002, "elapsed_sec": 32769.410066366196, "step_time_sec": 8.22749362999457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3955, "loss": 4.255939960479736, "lr": 0.0002, "elapsed_sec": 32777.63795685768, "step_time_sec": 8.227713341999333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3956, "loss": 4.351117134094238, "lr": 0.0002, "elapsed_sec": 32785.86761260033, "step_time_sec": 8.229514624981675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3957, "loss": 4.325922966003418, "lr": 0.0002, "elapsed_sec": 32794.09872317314, "step_time_sec": 8.230939928005682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3958, "loss": 4.290003299713135, "lr": 0.0002, "elapsed_sec": 32802.330263614655, "step_time_sec": 8.231373739981791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3959, "loss": 4.449404239654541, "lr": 0.0002, "elapsed_sec": 32810.56187224388, "step_time_sec": 8.231424809986493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3960, "loss": 4.444206714630127, "lr": 0.0002, "elapsed_sec": 32818.79297924042, "step_time_sec": 8.23096124100266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3961, "loss": 4.237222194671631, "lr": 0.0002, "elapsed_sec": 32827.02352261543, "step_time_sec": 8.23040949300048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3962, "loss": 4.290450572967529, "lr": 0.0002, "elapsed_sec": 32835.25476169586, "step_time_sec": 8.23114536801586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3963, "loss": 4.3621745109558105, "lr": 0.0002, "elapsed_sec": 32843.48445916176, "step_time_sec": 8.229474777996074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3964, "loss": 4.234163284301758, "lr": 0.0002, "elapsed_sec": 32851.71321582794, "step_time_sec": 8.228626289987005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3965, "loss": 4.273988246917725, "lr": 0.0002, "elapsed_sec": 32859.943655729294, "step_time_sec": 8.230303109012311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3966, "loss": 4.430858135223389, "lr": 0.0002, "elapsed_sec": 32868.17502808571, "step_time_sec": 8.231236416992033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3967, "loss": 4.38608455657959, "lr": 0.0002, "elapsed_sec": 32876.406594753265, "step_time_sec": 8.231340051977895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3968, "loss": 4.2669572830200195, "lr": 0.0002, "elapsed_sec": 32884.63741493225, "step_time_sec": 8.230673027021112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3969, "loss": 4.317612171173096, "lr": 0.0002, "elapsed_sec": 32892.86892390251, "step_time_sec": 8.231402851990424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3970, "loss": 4.322591304779053, "lr": 0.0002, "elapsed_sec": 32901.0968542099, "step_time_sec": 8.227719587011961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3971, "loss": 4.279318332672119, "lr": 0.0002, "elapsed_sec": 32909.32596230507, "step_time_sec": 8.228936049999902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3972, "loss": 4.328319549560547, "lr": 0.0002, "elapsed_sec": 32917.556689977646, "step_time_sec": 8.230610858998261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3973, "loss": 4.324944019317627, "lr": 0.0002, "elapsed_sec": 32925.7859351635, "step_time_sec": 8.229082112986362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3974, "loss": 4.265122890472412, "lr": 0.0002, "elapsed_sec": 32934.01494407654, "step_time_sec": 8.228849687002366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3975, "loss": 4.27731466293335, "lr": 0.0002, "elapsed_sec": 32942.24562764168, "step_time_sec": 8.230518356023822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3976, "loss": 4.261387348175049, "lr": 0.0002, "elapsed_sec": 32950.47680211067, "step_time_sec": 8.231026393012144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3977, "loss": 4.336571216583252, "lr": 0.0002, "elapsed_sec": 32958.70794796944, "step_time_sec": 8.2309858709923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3978, "loss": 4.3192853927612305, "lr": 0.0002, "elapsed_sec": 32966.93785715103, "step_time_sec": 8.229776074993424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3979, "loss": 4.277933120727539, "lr": 0.0002, "elapsed_sec": 32975.16700530052, "step_time_sec": 8.22904399401159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3980, "loss": 3.875608205795288, "lr": 0.0002, "elapsed_sec": 32983.397824287415, "step_time_sec": 8.230585450015496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3981, "loss": 4.085686683654785, "lr": 0.0002, "elapsed_sec": 32991.6296005249, "step_time_sec": 8.23164887499297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3982, "loss": 4.276954174041748, "lr": 0.0002, "elapsed_sec": 32999.86080312729, "step_time_sec": 8.231041310995352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3983, "loss": 4.448758602142334, "lr": 0.0002, "elapsed_sec": 33008.091693878174, "step_time_sec": 8.230760587990517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3984, "loss": 4.193026065826416, "lr": 0.0002, "elapsed_sec": 33016.321335315704, "step_time_sec": 8.229500008019386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3985, "loss": 4.375730514526367, "lr": 0.0002, "elapsed_sec": 33024.54862546921, "step_time_sec": 8.227132077998249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3986, "loss": 4.162057876586914, "lr": 0.0002, "elapsed_sec": 33032.780299663544, "step_time_sec": 8.231450802995823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3987, "loss": 4.346490859985352, "lr": 0.0002, "elapsed_sec": 33041.010892391205, "step_time_sec": 8.230470782000339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3988, "loss": 4.337082862854004, "lr": 0.0002, "elapsed_sec": 33049.24341201782, "step_time_sec": 8.232357786007924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3989, "loss": 4.206600666046143, "lr": 0.0002, "elapsed_sec": 33057.473999500275, "step_time_sec": 8.23039647898986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3990, "loss": 3.991809129714966, "lr": 0.0002, "elapsed_sec": 33065.7046983242, "step_time_sec": 8.230544145015301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3991, "loss": 4.3525872230529785, "lr": 0.0002, "elapsed_sec": 33073.93561697006, "step_time_sec": 8.230789952998748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3992, "loss": 4.242486476898193, "lr": 0.0002, "elapsed_sec": 33082.166511297226, "step_time_sec": 8.23067199299112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3993, "loss": 4.28725528717041, "lr": 0.0002, "elapsed_sec": 33090.397877693176, "step_time_sec": 8.231244169001002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3994, "loss": 4.293519496917725, "lr": 0.0002, "elapsed_sec": 33098.62749624252, "step_time_sec": 8.229396659007762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3995, "loss": 4.358375549316406, "lr": 0.0002, "elapsed_sec": 33106.85695910454, "step_time_sec": 8.229383494996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3996, "loss": 4.184296607971191, "lr": 0.0002, "elapsed_sec": 33115.08480429649, "step_time_sec": 8.227626616018824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3997, "loss": 4.311811923980713, "lr": 0.0002, "elapsed_sec": 33123.31324481964, "step_time_sec": 8.228318581008352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3998, "loss": 4.366016387939453, "lr": 0.0002, "elapsed_sec": 33131.54138255119, "step_time_sec": 8.227976163005223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 3999, "loss": 4.380951404571533, "lr": 0.0002, "elapsed_sec": 33139.772032022476, "step_time_sec": 8.230492333008442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4000, "loss": 4.422539710998535, "lr": 0.0002, "elapsed_sec": 33148.00298333168, "step_time_sec": 51.73892071301816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.985696245013969, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4001, "loss": 4.384857177734375, "lr": 0.0002, "elapsed_sec": 33199.73869919777, "step_time_sec": 8.227458298992133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4002, "loss": 4.305457592010498, "lr": 0.0002, "elapsed_sec": 33207.9682302475, "step_time_sec": 8.22931918100221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4003, "loss": 4.266195297241211, "lr": 0.0002, "elapsed_sec": 33216.197984695435, "step_time_sec": 8.229645629005972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4004, "loss": 4.339996814727783, "lr": 0.0002, "elapsed_sec": 33224.42846369743, "step_time_sec": 8.230269521998707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4005, "loss": 4.194450378417969, "lr": 0.0002, "elapsed_sec": 33232.659590005875, "step_time_sec": 8.230951881007059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4006, "loss": 4.402194023132324, "lr": 0.0002, "elapsed_sec": 33240.89109253883, "step_time_sec": 8.23136763900402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4007, "loss": 4.266725540161133, "lr": 0.0002, "elapsed_sec": 33249.121705532074, "step_time_sec": 8.230460794991814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4008, "loss": 4.371466159820557, "lr": 0.0002, "elapsed_sec": 33257.35180926323, "step_time_sec": 8.229947310988791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4009, "loss": 4.332723140716553, "lr": 0.0002, "elapsed_sec": 33265.58226943016, "step_time_sec": 8.230347445991356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4010, "loss": 4.452934265136719, "lr": 0.0002, "elapsed_sec": 33273.8131582737, "step_time_sec": 8.230686078983126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4011, "loss": 4.192954063415527, "lr": 0.0002, "elapsed_sec": 33282.04400587082, "step_time_sec": 8.230753706011456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4012, "loss": 4.215887546539307, "lr": 0.0002, "elapsed_sec": 33290.27407646179, "step_time_sec": 8.229857912985608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4013, "loss": 4.303738117218018, "lr": 0.0002, "elapsed_sec": 33298.50505781174, "step_time_sec": 8.230865249002818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4014, "loss": 4.256423473358154, "lr": 0.0002, "elapsed_sec": 33306.73493909836, "step_time_sec": 8.229655689996434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4015, "loss": 4.268110752105713, "lr": 0.0002, "elapsed_sec": 33314.96511387825, "step_time_sec": 8.230090469995048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4016, "loss": 4.526261329650879, "lr": 0.0002, "elapsed_sec": 33323.19543838501, "step_time_sec": 8.230127691989765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4017, "loss": 4.3115386962890625, "lr": 0.0002, "elapsed_sec": 33331.426327228546, "step_time_sec": 8.230715515994234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4018, "loss": 4.212771892547607, "lr": 0.0002, "elapsed_sec": 33339.65464448929, "step_time_sec": 8.22815282997908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4019, "loss": 4.275023937225342, "lr": 0.0002, "elapsed_sec": 33347.88516449928, "step_time_sec": 8.230341685004532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4020, "loss": 4.392457485198975, "lr": 0.0002, "elapsed_sec": 33356.11663079262, "step_time_sec": 8.231302751984913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4021, "loss": 4.197851181030273, "lr": 0.0002, "elapsed_sec": 33364.34714126587, "step_time_sec": 8.230374895007117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4022, "loss": 4.314797401428223, "lr": 0.0002, "elapsed_sec": 33372.57848119736, "step_time_sec": 8.231181431008736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4023, "loss": 4.460278034210205, "lr": 0.0002, "elapsed_sec": 33380.81063747406, "step_time_sec": 8.232024247990921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4024, "loss": 4.308619022369385, "lr": 0.0002, "elapsed_sec": 33389.04028105736, "step_time_sec": 8.229474349995144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4025, "loss": 4.307256698608398, "lr": 0.0002, "elapsed_sec": 33397.270745038986, "step_time_sec": 8.23035060701659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4026, "loss": 4.383181571960449, "lr": 0.0002, "elapsed_sec": 33405.5018620491, "step_time_sec": 8.230937371001346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4027, "loss": 4.403078079223633, "lr": 0.0002, "elapsed_sec": 33413.73400807381, "step_time_sec": 8.231924410996726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4028, "loss": 4.267844200134277, "lr": 0.0002, "elapsed_sec": 33421.964697122574, "step_time_sec": 8.230510556983063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4029, "loss": 4.341007232666016, "lr": 0.0002, "elapsed_sec": 33430.194853782654, "step_time_sec": 8.230023088020971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4030, "loss": 4.257221221923828, "lr": 0.0002, "elapsed_sec": 33438.42445421219, "step_time_sec": 8.229481965012383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4031, "loss": 4.42524528503418, "lr": 0.0002, "elapsed_sec": 33446.65190720558, "step_time_sec": 8.227241774002323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4032, "loss": 4.283503532409668, "lr": 0.0002, "elapsed_sec": 33454.882897138596, "step_time_sec": 8.23082916601561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4033, "loss": 4.271056652069092, "lr": 0.0002, "elapsed_sec": 33463.11347723007, "step_time_sec": 8.230418316001305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4034, "loss": 4.375495910644531, "lr": 0.0002, "elapsed_sec": 33471.3446059227, "step_time_sec": 8.230976151011419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4035, "loss": 4.244600296020508, "lr": 0.0002, "elapsed_sec": 33479.57454895973, "step_time_sec": 8.229798173997551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4036, "loss": 4.324399471282959, "lr": 0.0002, "elapsed_sec": 33487.804590940475, "step_time_sec": 8.229921593010658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4037, "loss": 4.256872177124023, "lr": 0.0002, "elapsed_sec": 33496.03405499458, "step_time_sec": 8.229253776022233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4038, "loss": 4.359719753265381, "lr": 0.0002, "elapsed_sec": 33504.26244068146, "step_time_sec": 8.22821473798831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4039, "loss": 4.252533912658691, "lr": 0.0002, "elapsed_sec": 33512.49233698845, "step_time_sec": 8.229724341013934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4040, "loss": 4.270328521728516, "lr": 0.0002, "elapsed_sec": 33520.72142601013, "step_time_sec": 8.228976220998447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4041, "loss": 4.183221817016602, "lr": 0.0002, "elapsed_sec": 33528.95025014877, "step_time_sec": 8.228632746002404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4042, "loss": 4.198371887207031, "lr": 0.0002, "elapsed_sec": 33537.18160867691, "step_time_sec": 8.231181522016414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4043, "loss": 4.3062424659729, "lr": 0.0002, "elapsed_sec": 33545.4115626812, "step_time_sec": 8.229786346986657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4044, "loss": 4.023455619812012, "lr": 0.0002, "elapsed_sec": 33553.641612291336, "step_time_sec": 8.229984003002755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4045, "loss": 4.386758327484131, "lr": 0.0002, "elapsed_sec": 33561.872718811035, "step_time_sec": 8.230846355989343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4046, "loss": 4.333779335021973, "lr": 0.0002, "elapsed_sec": 33570.103511333466, "step_time_sec": 8.23065638000844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4047, "loss": 4.460576057434082, "lr": 0.0002, "elapsed_sec": 33578.33442950249, "step_time_sec": 8.230788190994645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4048, "loss": 4.312905788421631, "lr": 0.0002, "elapsed_sec": 33586.56486725807, "step_time_sec": 8.230271022010129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4049, "loss": 4.160184860229492, "lr": 0.0002, "elapsed_sec": 33594.79610800743, "step_time_sec": 8.231046241009608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4050, "loss": 4.214859962463379, "lr": 0.0002, "elapsed_sec": 33603.02510547638, "step_time_sec": 8.228832053020597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4051, "loss": 4.249477386474609, "lr": 0.0002, "elapsed_sec": 33611.256120443344, "step_time_sec": 8.230830610991688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4052, "loss": 4.374999046325684, "lr": 0.0002, "elapsed_sec": 33619.48470830917, "step_time_sec": 8.22846692497842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4053, "loss": 4.179116725921631, "lr": 0.0002, "elapsed_sec": 33627.71289277077, "step_time_sec": 8.227964288002113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4054, "loss": 4.229220390319824, "lr": 0.0002, "elapsed_sec": 33635.93907403946, "step_time_sec": 8.226047530974029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4055, "loss": 4.299295425415039, "lr": 0.0002, "elapsed_sec": 33644.16793465614, "step_time_sec": 8.228672770987032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4056, "loss": 4.322876453399658, "lr": 0.0002, "elapsed_sec": 33652.397762298584, "step_time_sec": 8.229662704019574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4057, "loss": 4.34381628036499, "lr": 0.0002, "elapsed_sec": 33660.62776708603, "step_time_sec": 8.229830460011726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4058, "loss": 4.265216827392578, "lr": 0.0002, "elapsed_sec": 33668.85787034035, "step_time_sec": 8.229991811997024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4059, "loss": 4.426596641540527, "lr": 0.0002, "elapsed_sec": 33677.08831238747, "step_time_sec": 8.23030676401686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4060, "loss": 4.254561901092529, "lr": 0.0002, "elapsed_sec": 33685.31840586662, "step_time_sec": 8.229881947976537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4061, "loss": 4.252817630767822, "lr": 0.0002, "elapsed_sec": 33693.54955244064, "step_time_sec": 8.231037249992369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4062, "loss": 4.493114948272705, "lr": 0.0002, "elapsed_sec": 33702.90769267082, "step_time_sec": 9.358018344995799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4063, "loss": 4.2273359298706055, "lr": 0.0002, "elapsed_sec": 33711.138095378876, "step_time_sec": 8.230218339012936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4064, "loss": 4.245636940002441, "lr": 0.0002, "elapsed_sec": 33719.3677406311, "step_time_sec": 8.229415062989574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4065, "loss": 4.363088130950928, "lr": 0.0002, "elapsed_sec": 33727.598266363144, "step_time_sec": 8.230364038987318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4066, "loss": 4.304383277893066, "lr": 0.0002, "elapsed_sec": 33735.83019733429, "step_time_sec": 8.23179884700221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4067, "loss": 4.3014020919799805, "lr": 0.0002, "elapsed_sec": 33744.06062245369, "step_time_sec": 8.23026472600759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4068, "loss": 4.458287715911865, "lr": 0.0002, "elapsed_sec": 33752.289804935455, "step_time_sec": 8.229022110986989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4069, "loss": 4.077003002166748, "lr": 0.0002, "elapsed_sec": 33760.519089221954, "step_time_sec": 8.22911913198186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4070, "loss": 4.341283798217773, "lr": 0.0002, "elapsed_sec": 33768.75053071976, "step_time_sec": 8.231301274994621, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4071, "loss": 4.070695877075195, "lr": 0.0002, "elapsed_sec": 33776.98085260391, "step_time_sec": 8.2301411619992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4072, "loss": 4.279664993286133, "lr": 0.0002, "elapsed_sec": 33785.21238398552, "step_time_sec": 8.231389958993532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4073, "loss": 4.2595672607421875, "lr": 0.0002, "elapsed_sec": 33793.442472696304, "step_time_sec": 8.22995317200548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4074, "loss": 4.345858573913574, "lr": 0.0002, "elapsed_sec": 33801.67375636101, "step_time_sec": 8.231093782000244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4075, "loss": 4.208197116851807, "lr": 0.0002, "elapsed_sec": 33809.9043636322, "step_time_sec": 8.230445424007485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4076, "loss": 4.217564582824707, "lr": 0.0002, "elapsed_sec": 33818.13450217247, "step_time_sec": 8.229994446999626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4077, "loss": 4.275092124938965, "lr": 0.0002, "elapsed_sec": 33826.364958524704, "step_time_sec": 8.230332434992306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4078, "loss": 4.3203301429748535, "lr": 0.0002, "elapsed_sec": 33834.59695291519, "step_time_sec": 8.231792887003394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4079, "loss": 4.2566752433776855, "lr": 0.0002, "elapsed_sec": 33842.82767057419, "step_time_sec": 8.230569406005088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4080, "loss": 4.35008430480957, "lr": 0.0002, "elapsed_sec": 33851.05932021141, "step_time_sec": 8.231525240000337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4081, "loss": 4.1932783126831055, "lr": 0.0002, "elapsed_sec": 33859.288756370544, "step_time_sec": 8.229252903023735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4082, "loss": 4.331544399261475, "lr": 0.0002, "elapsed_sec": 33867.518914699554, "step_time_sec": 8.229992469015997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4083, "loss": 4.206451892852783, "lr": 0.0002, "elapsed_sec": 33875.747609853745, "step_time_sec": 8.228541630000109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4084, "loss": 4.30452299118042, "lr": 0.0002, "elapsed_sec": 33883.97490596771, "step_time_sec": 8.227180988993496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4085, "loss": 4.232844829559326, "lr": 0.0002, "elapsed_sec": 33892.20516037941, "step_time_sec": 8.230077160988003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4086, "loss": 4.148730278015137, "lr": 0.0002, "elapsed_sec": 33900.43563771248, "step_time_sec": 8.230322903982596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4087, "loss": 4.320339202880859, "lr": 0.0002, "elapsed_sec": 33908.66425037384, "step_time_sec": 8.228517479001312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4088, "loss": 4.266329288482666, "lr": 0.0002, "elapsed_sec": 33916.89208650589, "step_time_sec": 8.227622528997017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4089, "loss": 4.352088451385498, "lr": 0.0002, "elapsed_sec": 33925.121044158936, "step_time_sec": 8.228763149003498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4090, "loss": 4.086559772491455, "lr": 0.0002, "elapsed_sec": 33933.34761548042, "step_time_sec": 8.226408405986149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4091, "loss": 4.485457897186279, "lr": 0.0002, "elapsed_sec": 33941.57823085785, "step_time_sec": 8.230512491980335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4092, "loss": 4.136302471160889, "lr": 0.0002, "elapsed_sec": 33949.810003995895, "step_time_sec": 8.231596344994614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4093, "loss": 4.2744460105896, "lr": 0.0002, "elapsed_sec": 33958.03824162483, "step_time_sec": 8.228059456014307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4094, "loss": 4.256343841552734, "lr": 0.0002, "elapsed_sec": 33966.26855683327, "step_time_sec": 8.23020341299707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4095, "loss": 4.223748207092285, "lr": 0.0002, "elapsed_sec": 33974.4979827404, "step_time_sec": 8.229233322985237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4096, "loss": 4.284183025360107, "lr": 0.0002, "elapsed_sec": 33982.72527503967, "step_time_sec": 8.227126787998714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4097, "loss": 4.247807025909424, "lr": 0.0002, "elapsed_sec": 33990.95485544205, "step_time_sec": 8.229428873979487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4098, "loss": 4.169789791107178, "lr": 0.0002, "elapsed_sec": 33999.183745622635, "step_time_sec": 8.228748528985307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4099, "loss": 4.259757041931152, "lr": 0.0002, "elapsed_sec": 34007.41255545616, "step_time_sec": 8.228623881994281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4100, "loss": 4.243464946746826, "lr": 0.0002, "elapsed_sec": 34015.640749931335, "step_time_sec": 8.228026256983867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4101, "loss": 4.445199012756348, "lr": 0.0002, "elapsed_sec": 34023.870812654495, "step_time_sec": 8.229965359001653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4102, "loss": 4.2407026290893555, "lr": 0.0002, "elapsed_sec": 34032.102819919586, "step_time_sec": 8.231805896997685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4103, "loss": 4.159867763519287, "lr": 0.0002, "elapsed_sec": 34040.334028959274, "step_time_sec": 8.231067554996116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4104, "loss": 4.239254474639893, "lr": 0.0002, "elapsed_sec": 34048.56440806389, "step_time_sec": 8.23021532202256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4105, "loss": 4.326719284057617, "lr": 0.0002, "elapsed_sec": 34056.79671216011, "step_time_sec": 8.232144548004726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4106, "loss": 4.336667060852051, "lr": 0.0002, "elapsed_sec": 34065.02676510811, "step_time_sec": 8.229904150008224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4107, "loss": 4.1862993240356445, "lr": 0.0002, "elapsed_sec": 34073.257529735565, "step_time_sec": 8.230588854989037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4108, "loss": 4.400637149810791, "lr": 0.0002, "elapsed_sec": 34081.48550200462, "step_time_sec": 8.227883703017142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4109, "loss": 4.187801837921143, "lr": 0.0002, "elapsed_sec": 34089.71510863304, "step_time_sec": 8.229432138003176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4110, "loss": 4.237775802612305, "lr": 0.0002, "elapsed_sec": 34097.94576907158, "step_time_sec": 8.23046348898788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4111, "loss": 4.178308963775635, "lr": 0.0002, "elapsed_sec": 34106.176043987274, "step_time_sec": 8.230112279008608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4112, "loss": 4.175769329071045, "lr": 0.0002, "elapsed_sec": 34114.406440258026, "step_time_sec": 8.230243673024233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4113, "loss": 4.17741584777832, "lr": 0.0002, "elapsed_sec": 34122.636019706726, "step_time_sec": 8.22942785199848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4114, "loss": 4.2991790771484375, "lr": 0.0002, "elapsed_sec": 34130.86629509926, "step_time_sec": 8.230100601998856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4115, "loss": 4.2946672439575195, "lr": 0.0002, "elapsed_sec": 34139.096051216125, "step_time_sec": 8.229668834013864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4116, "loss": 4.208618640899658, "lr": 0.0002, "elapsed_sec": 34147.32666540146, "step_time_sec": 8.230389438016573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4117, "loss": 4.210371494293213, "lr": 0.0002, "elapsed_sec": 34155.55725026131, "step_time_sec": 8.230512410984375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4118, "loss": 4.232898712158203, "lr": 0.0002, "elapsed_sec": 34163.78754615784, "step_time_sec": 8.230124997993698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4119, "loss": 4.366714000701904, "lr": 0.0002, "elapsed_sec": 34172.01846885681, "step_time_sec": 8.23071033298038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4120, "loss": 4.254580497741699, "lr": 0.0002, "elapsed_sec": 34180.24860239029, "step_time_sec": 8.229973162990063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4121, "loss": 4.2276835441589355, "lr": 0.0002, "elapsed_sec": 34188.47762465477, "step_time_sec": 8.228921707981499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4122, "loss": 4.2650861740112305, "lr": 0.0002, "elapsed_sec": 34196.706067085266, "step_time_sec": 8.228269685001578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4123, "loss": 4.187106132507324, "lr": 0.0002, "elapsed_sec": 34204.9356007576, "step_time_sec": 8.229318394005531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4124, "loss": 4.202491760253906, "lr": 0.0002, "elapsed_sec": 34213.16677904129, "step_time_sec": 8.231018838996533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4125, "loss": 4.341306686401367, "lr": 0.0002, "elapsed_sec": 34221.397476673126, "step_time_sec": 8.23054849699838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4126, "loss": 4.365311622619629, "lr": 0.0002, "elapsed_sec": 34229.62870597839, "step_time_sec": 8.231142192991683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4127, "loss": 4.2230329513549805, "lr": 0.0002, "elapsed_sec": 34237.857099056244, "step_time_sec": 8.228164753003512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4128, "loss": 4.235674858093262, "lr": 0.0002, "elapsed_sec": 34246.08785247803, "step_time_sec": 8.230642031005118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4129, "loss": 4.215052127838135, "lr": 0.0002, "elapsed_sec": 34254.31916832924, "step_time_sec": 8.231108632986434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4130, "loss": 4.2000555992126465, "lr": 0.0002, "elapsed_sec": 34262.549669742584, "step_time_sec": 8.230357663007453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4131, "loss": 4.1839799880981445, "lr": 0.0002, "elapsed_sec": 34270.78058767319, "step_time_sec": 8.230740681989118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4132, "loss": 4.145847320556641, "lr": 0.0002, "elapsed_sec": 34279.009843587875, "step_time_sec": 8.229133902001195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4133, "loss": 4.277129650115967, "lr": 0.0002, "elapsed_sec": 34287.23965382576, "step_time_sec": 8.22959660901688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4134, "loss": 4.220885276794434, "lr": 0.0002, "elapsed_sec": 34295.46726179123, "step_time_sec": 8.227463381976122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4135, "loss": 4.277261734008789, "lr": 0.0002, "elapsed_sec": 34303.69751429558, "step_time_sec": 8.230115261976607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4136, "loss": 4.219878673553467, "lr": 0.0002, "elapsed_sec": 34311.92606806755, "step_time_sec": 8.228419315011706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4137, "loss": 4.2660322189331055, "lr": 0.0002, "elapsed_sec": 34320.154601573944, "step_time_sec": 8.228321008005878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4138, "loss": 4.225204944610596, "lr": 0.0002, "elapsed_sec": 34328.38332223892, "step_time_sec": 8.228537096991204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4139, "loss": 4.209633827209473, "lr": 0.0002, "elapsed_sec": 34336.614518880844, "step_time_sec": 8.231122663011774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4140, "loss": 4.306357383728027, "lr": 0.0002, "elapsed_sec": 34344.84499812126, "step_time_sec": 8.230246456019813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4141, "loss": 4.3075408935546875, "lr": 0.0002, "elapsed_sec": 34353.075548648834, "step_time_sec": 8.230364326998824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4142, "loss": 4.176600933074951, "lr": 0.0002, "elapsed_sec": 34361.30649638176, "step_time_sec": 8.23078117100522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4143, "loss": 4.194456100463867, "lr": 0.0002, "elapsed_sec": 34369.53761482239, "step_time_sec": 8.2310041479941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4144, "loss": 4.318521499633789, "lr": 0.0002, "elapsed_sec": 34377.7653670311, "step_time_sec": 8.227590851020068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4145, "loss": 4.267191410064697, "lr": 0.0002, "elapsed_sec": 34385.99464392662, "step_time_sec": 8.229112878994783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4146, "loss": 4.221352577209473, "lr": 0.0002, "elapsed_sec": 34394.223366737366, "step_time_sec": 8.228613095998298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4147, "loss": 4.206750392913818, "lr": 0.0002, "elapsed_sec": 34402.45470023155, "step_time_sec": 8.231094100978225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4148, "loss": 4.2610955238342285, "lr": 0.0002, "elapsed_sec": 34410.68575954437, "step_time_sec": 8.230918549990747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4149, "loss": 4.1735076904296875, "lr": 0.0002, "elapsed_sec": 34418.91620492935, "step_time_sec": 8.23029818199575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4150, "loss": 4.289871692657471, "lr": 0.0002, "elapsed_sec": 34427.14606714249, "step_time_sec": 8.229721591022098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4151, "loss": 4.333330154418945, "lr": 0.0002, "elapsed_sec": 34435.376002550125, "step_time_sec": 8.229762739996659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4152, "loss": 4.246454238891602, "lr": 0.0002, "elapsed_sec": 34443.60615038872, "step_time_sec": 8.229948700987734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4153, "loss": 4.326076507568359, "lr": 0.0002, "elapsed_sec": 34451.83817625046, "step_time_sec": 8.231919874000596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4154, "loss": 4.235642910003662, "lr": 0.0002, "elapsed_sec": 34460.068803071976, "step_time_sec": 8.230446248984663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4155, "loss": 4.292428493499756, "lr": 0.0002, "elapsed_sec": 34468.29710268974, "step_time_sec": 8.228090232005343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4156, "loss": 4.122279167175293, "lr": 0.0002, "elapsed_sec": 34476.526305913925, "step_time_sec": 8.229056397016393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4157, "loss": 4.213035583496094, "lr": 0.0002, "elapsed_sec": 34484.75753188133, "step_time_sec": 8.23111958100344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4158, "loss": 4.177075386047363, "lr": 0.0002, "elapsed_sec": 34492.98907995224, "step_time_sec": 8.23134176901658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4159, "loss": 4.082457542419434, "lr": 0.0002, "elapsed_sec": 34501.22002649307, "step_time_sec": 8.23076078898157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4160, "loss": 4.310046195983887, "lr": 0.0002, "elapsed_sec": 34509.45117998123, "step_time_sec": 8.231035776989302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4161, "loss": 4.12709903717041, "lr": 0.0002, "elapsed_sec": 34517.681569337845, "step_time_sec": 8.230195234995335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4162, "loss": 4.284276008605957, "lr": 0.0002, "elapsed_sec": 34525.911006212234, "step_time_sec": 8.229278216022067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4163, "loss": 4.133804798126221, "lr": 0.0002, "elapsed_sec": 34534.14221763611, "step_time_sec": 8.23109895500238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4164, "loss": 4.112847805023193, "lr": 0.0002, "elapsed_sec": 34542.373071432114, "step_time_sec": 8.230613558000186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4165, "loss": 4.152198791503906, "lr": 0.0002, "elapsed_sec": 34550.604615688324, "step_time_sec": 8.23138351700618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4166, "loss": 4.20428466796875, "lr": 0.0002, "elapsed_sec": 34558.83511304855, "step_time_sec": 8.230371104000369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4167, "loss": 4.251889228820801, "lr": 0.0002, "elapsed_sec": 34567.06604504585, "step_time_sec": 8.230826017999789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4168, "loss": 4.433748722076416, "lr": 0.0002, "elapsed_sec": 34575.294405698776, "step_time_sec": 8.228221750003286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4169, "loss": 4.101216793060303, "lr": 0.0002, "elapsed_sec": 34583.5234541893, "step_time_sec": 8.228823273006128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4170, "loss": 4.166567802429199, "lr": 0.0002, "elapsed_sec": 34591.754405260086, "step_time_sec": 8.230807398998877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4171, "loss": 4.23553466796875, "lr": 0.0002, "elapsed_sec": 34599.985154390335, "step_time_sec": 8.230583260999992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4172, "loss": 4.286533832550049, "lr": 0.0002, "elapsed_sec": 34608.21602392197, "step_time_sec": 8.230767305009067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4173, "loss": 4.480080604553223, "lr": 0.0002, "elapsed_sec": 34616.44503855705, "step_time_sec": 8.228811210981803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4174, "loss": 4.177270412445068, "lr": 0.0002, "elapsed_sec": 34624.675112485886, "step_time_sec": 8.229935151000973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4175, "loss": 4.228646278381348, "lr": 0.0002, "elapsed_sec": 34632.90258836746, "step_time_sec": 8.22730056400178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4176, "loss": 4.2085137367248535, "lr": 0.0002, "elapsed_sec": 34641.13256978989, "step_time_sec": 8.229820028005634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4177, "loss": 4.281313419342041, "lr": 0.0002, "elapsed_sec": 34649.36374902725, "step_time_sec": 8.231063029990764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4178, "loss": 4.341627597808838, "lr": 0.0002, "elapsed_sec": 34657.59204173088, "step_time_sec": 8.228094961988972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4179, "loss": 4.159278392791748, "lr": 0.0002, "elapsed_sec": 34665.82021212578, "step_time_sec": 8.228058967011748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4180, "loss": 4.280425548553467, "lr": 0.0002, "elapsed_sec": 34674.051664590836, "step_time_sec": 8.231251057994086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4181, "loss": 4.155552387237549, "lr": 0.0002, "elapsed_sec": 34682.28232336044, "step_time_sec": 8.230551907006884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4182, "loss": 4.173027038574219, "lr": 0.0002, "elapsed_sec": 34690.51324558258, "step_time_sec": 8.230808064021403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4183, "loss": 4.305334091186523, "lr": 0.0002, "elapsed_sec": 34698.74385070801, "step_time_sec": 8.23039453101228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4184, "loss": 4.168464660644531, "lr": 0.0002, "elapsed_sec": 34706.975350141525, "step_time_sec": 8.231299726990983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4185, "loss": 4.306933879852295, "lr": 0.0002, "elapsed_sec": 34715.20340800285, "step_time_sec": 8.227918674994726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4186, "loss": 4.388588905334473, "lr": 0.0002, "elapsed_sec": 34723.43445968628, "step_time_sec": 8.230898403009633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4187, "loss": 4.193704128265381, "lr": 0.0002, "elapsed_sec": 34731.667002916336, "step_time_sec": 8.232401096000103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4188, "loss": 4.119025707244873, "lr": 0.0002, "elapsed_sec": 34739.89766287804, "step_time_sec": 8.230502922990127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4189, "loss": 4.310399532318115, "lr": 0.0002, "elapsed_sec": 34748.12813258171, "step_time_sec": 8.230291850020876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4190, "loss": 4.150895118713379, "lr": 0.0002, "elapsed_sec": 34756.356452941895, "step_time_sec": 8.228167213994311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4191, "loss": 4.223357677459717, "lr": 0.0002, "elapsed_sec": 34764.587906360626, "step_time_sec": 8.231322708015796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4192, "loss": 4.158988952636719, "lr": 0.0002, "elapsed_sec": 34772.81887817383, "step_time_sec": 8.230779742007144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4193, "loss": 4.176754474639893, "lr": 0.0002, "elapsed_sec": 34781.047790527344, "step_time_sec": 8.228784533013823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4194, "loss": 4.265151500701904, "lr": 0.0002, "elapsed_sec": 34789.27626109123, "step_time_sec": 8.228365399001632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4195, "loss": 4.271710395812988, "lr": 0.0002, "elapsed_sec": 34797.506242752075, "step_time_sec": 8.229813540005125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4196, "loss": 4.302209854125977, "lr": 0.0002, "elapsed_sec": 34805.73575282097, "step_time_sec": 8.229298153979471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4197, "loss": 4.299983978271484, "lr": 0.0002, "elapsed_sec": 34813.96444654465, "step_time_sec": 8.228649226017296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4198, "loss": 4.135176658630371, "lr": 0.0002, "elapsed_sec": 34822.1939368248, "step_time_sec": 8.229257673985558, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4199, "loss": 4.142453193664551, "lr": 0.0002, "elapsed_sec": 34830.42457461357, "step_time_sec": 8.230436305020703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4200, "loss": 4.398089408874512, "lr": 0.0002, "elapsed_sec": 34838.65573000908, "step_time_sec": 8.23098977500922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4201, "loss": 4.060574054718018, "lr": 0.0002, "elapsed_sec": 34846.88739466667, "step_time_sec": 8.231539783999324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4202, "loss": 4.385723114013672, "lr": 0.0002, "elapsed_sec": 34855.1184027195, "step_time_sec": 8.23083910398418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4203, "loss": 4.401528835296631, "lr": 0.0002, "elapsed_sec": 34863.349467754364, "step_time_sec": 8.230893965024734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4204, "loss": 4.209896564483643, "lr": 0.0002, "elapsed_sec": 34871.57958030701, "step_time_sec": 8.229978642018978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4205, "loss": 4.173553943634033, "lr": 0.0002, "elapsed_sec": 34879.810252428055, "step_time_sec": 8.230569870996987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4206, "loss": 4.086690425872803, "lr": 0.0002, "elapsed_sec": 34888.039330005646, "step_time_sec": 8.22882388200378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4207, "loss": 4.35880184173584, "lr": 0.0002, "elapsed_sec": 34896.26861333847, "step_time_sec": 8.229134824010544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4208, "loss": 4.258025169372559, "lr": 0.0002, "elapsed_sec": 34904.49708032608, "step_time_sec": 8.228346207994036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4209, "loss": 4.274044513702393, "lr": 0.0002, "elapsed_sec": 34912.72744512558, "step_time_sec": 8.230163344007451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4210, "loss": 4.337063312530518, "lr": 0.0002, "elapsed_sec": 34920.95828270912, "step_time_sec": 8.23070816899417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4211, "loss": 4.284075736999512, "lr": 0.0002, "elapsed_sec": 34929.185697078705, "step_time_sec": 8.227199511020444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4212, "loss": 4.220357894897461, "lr": 0.0002, "elapsed_sec": 34937.414078474045, "step_time_sec": 8.228284837998217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4213, "loss": 4.420162677764893, "lr": 0.0002, "elapsed_sec": 34945.64201378822, "step_time_sec": 8.227736275002826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4214, "loss": 4.27372407913208, "lr": 0.0002, "elapsed_sec": 34953.87058329582, "step_time_sec": 8.228407937975135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4215, "loss": 4.199394702911377, "lr": 0.0002, "elapsed_sec": 34962.09833908081, "step_time_sec": 8.227619251003489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4216, "loss": 4.248223304748535, "lr": 0.0002, "elapsed_sec": 34970.33052825928, "step_time_sec": 8.23199285400915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4217, "loss": 4.125733852386475, "lr": 0.0002, "elapsed_sec": 34978.560658454895, "step_time_sec": 8.230032171006314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4218, "loss": 4.469295501708984, "lr": 0.0002, "elapsed_sec": 34986.79175710678, "step_time_sec": 8.230857668997487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4219, "loss": 4.342073440551758, "lr": 0.0002, "elapsed_sec": 34995.02099776268, "step_time_sec": 8.229158078000182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4220, "loss": 4.428220748901367, "lr": 0.0002, "elapsed_sec": 35003.25237894058, "step_time_sec": 8.231120896991342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4221, "loss": 4.245110034942627, "lr": 0.0002, "elapsed_sec": 35011.48302102089, "step_time_sec": 8.230544054007623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4222, "loss": 4.183611869812012, "lr": 0.0002, "elapsed_sec": 35019.71380972862, "step_time_sec": 8.23064102299395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4223, "loss": 4.341169357299805, "lr": 0.0002, "elapsed_sec": 35027.945497751236, "step_time_sec": 8.231453554995824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4224, "loss": 4.305819988250732, "lr": 0.0002, "elapsed_sec": 35036.17771291733, "step_time_sec": 8.232069202989805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4225, "loss": 4.093105316162109, "lr": 0.0002, "elapsed_sec": 35044.40875315666, "step_time_sec": 8.23092908700346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4226, "loss": 4.226653099060059, "lr": 0.0002, "elapsed_sec": 35052.63871049881, "step_time_sec": 8.229790557990782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4227, "loss": 4.229520797729492, "lr": 0.0002, "elapsed_sec": 35060.86824846268, "step_time_sec": 8.22932750498876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4228, "loss": 4.289247512817383, "lr": 0.0002, "elapsed_sec": 35069.09851002693, "step_time_sec": 8.23011346597923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4229, "loss": 4.392785549163818, "lr": 0.0002, "elapsed_sec": 35077.32879829407, "step_time_sec": 8.230188282002928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4230, "loss": 4.182607650756836, "lr": 0.0002, "elapsed_sec": 35085.55934739113, "step_time_sec": 8.230366386007518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4231, "loss": 4.124802589416504, "lr": 0.0002, "elapsed_sec": 35093.78979611397, "step_time_sec": 8.230259943025885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4232, "loss": 4.268449306488037, "lr": 0.0002, "elapsed_sec": 35102.02042245865, "step_time_sec": 8.230452757008607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4233, "loss": 4.1261725425720215, "lr": 0.0002, "elapsed_sec": 35110.25037813187, "step_time_sec": 8.229810992983403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4234, "loss": 4.269807815551758, "lr": 0.0002, "elapsed_sec": 35118.48155212402, "step_time_sec": 8.231073289993219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4235, "loss": 4.054732799530029, "lr": 0.0002, "elapsed_sec": 35126.712481975555, "step_time_sec": 8.230702569999266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4236, "loss": 4.173426628112793, "lr": 0.0002, "elapsed_sec": 35134.943778038025, "step_time_sec": 8.231152861000737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4237, "loss": 4.204068660736084, "lr": 0.0002, "elapsed_sec": 35143.174157857895, "step_time_sec": 8.230203563987743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4238, "loss": 4.217112064361572, "lr": 0.0002, "elapsed_sec": 35151.40451693535, "step_time_sec": 8.230195843003457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4239, "loss": 4.215391159057617, "lr": 0.0002, "elapsed_sec": 35159.631803274155, "step_time_sec": 8.22717362100957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4240, "loss": 4.207130432128906, "lr": 0.0002, "elapsed_sec": 35167.86071419716, "step_time_sec": 8.228734702977818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4241, "loss": 4.0351481437683105, "lr": 0.0002, "elapsed_sec": 35176.089383363724, "step_time_sec": 8.228473966999445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4242, "loss": 4.20946741104126, "lr": 0.0002, "elapsed_sec": 35184.31982088089, "step_time_sec": 8.230260287004057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4243, "loss": 4.08980131149292, "lr": 0.0002, "elapsed_sec": 35192.55011510849, "step_time_sec": 8.23018044201308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4244, "loss": 4.134890556335449, "lr": 0.0002, "elapsed_sec": 35200.77945399284, "step_time_sec": 8.229151894978713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4245, "loss": 4.170654773712158, "lr": 0.0002, "elapsed_sec": 35209.011201143265, "step_time_sec": 8.231600394006819, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4246, "loss": 4.343956470489502, "lr": 0.0002, "elapsed_sec": 35217.24206542969, "step_time_sec": 8.230700517015066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4247, "loss": 4.2544846534729, "lr": 0.0002, "elapsed_sec": 35225.47058010101, "step_time_sec": 8.228340346016921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4248, "loss": 4.077783584594727, "lr": 0.0002, "elapsed_sec": 35233.69923353195, "step_time_sec": 8.228582140000071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4249, "loss": 4.395805358886719, "lr": 0.0002, "elapsed_sec": 35241.928347587585, "step_time_sec": 8.228916476975428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4250, "loss": 4.270262718200684, "lr": 0.0002, "elapsed_sec": 35250.15786409378, "step_time_sec": 8.229324782994809, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4251, "loss": 4.315188884735107, "lr": 0.0002, "elapsed_sec": 35258.3891723156, "step_time_sec": 8.231161481002346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4252, "loss": 4.202799320220947, "lr": 0.0002, "elapsed_sec": 35266.61947393417, "step_time_sec": 8.230181405990152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4253, "loss": 4.265208721160889, "lr": 0.0002, "elapsed_sec": 35274.85061573982, "step_time_sec": 8.230990369978826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4254, "loss": 4.115179061889648, "lr": 0.0002, "elapsed_sec": 35283.081615448, "step_time_sec": 8.230874959990615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4255, "loss": 4.173925399780273, "lr": 0.0002, "elapsed_sec": 35291.312359809875, "step_time_sec": 8.230527625011746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4256, "loss": 4.186551094055176, "lr": 0.0002, "elapsed_sec": 35299.54263758659, "step_time_sec": 8.230165012006182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4257, "loss": 4.279425144195557, "lr": 0.0002, "elapsed_sec": 35307.773438453674, "step_time_sec": 8.230660330998944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4258, "loss": 4.1713762283325195, "lr": 0.0002, "elapsed_sec": 35316.00411653519, "step_time_sec": 8.230479187011952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4259, "loss": 4.201166152954102, "lr": 0.0002, "elapsed_sec": 35324.234282016754, "step_time_sec": 8.230003316013608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4260, "loss": 4.156345844268799, "lr": 0.0002, "elapsed_sec": 35332.46443390846, "step_time_sec": 8.230034776002867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4261, "loss": 4.192668437957764, "lr": 0.0002, "elapsed_sec": 35340.69545674324, "step_time_sec": 8.230806214007316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4262, "loss": 4.264127731323242, "lr": 0.0002, "elapsed_sec": 35348.923382759094, "step_time_sec": 8.227772954996908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4263, "loss": 4.197007179260254, "lr": 0.0002, "elapsed_sec": 35357.15338873863, "step_time_sec": 8.229900254984386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4264, "loss": 4.296494960784912, "lr": 0.0002, "elapsed_sec": 35365.383482694626, "step_time_sec": 8.229856794001535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4265, "loss": 4.168158054351807, "lr": 0.0002, "elapsed_sec": 35373.61405134201, "step_time_sec": 8.23041637899587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4266, "loss": 4.314492225646973, "lr": 0.0002, "elapsed_sec": 35381.84447979927, "step_time_sec": 8.23028325100313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4267, "loss": 4.161206245422363, "lr": 0.0002, "elapsed_sec": 35390.07585477829, "step_time_sec": 8.231256621016655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4268, "loss": 4.227755069732666, "lr": 0.0002, "elapsed_sec": 35398.30607032776, "step_time_sec": 8.230068205011776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4269, "loss": 4.232768535614014, "lr": 0.0002, "elapsed_sec": 35406.53422212601, "step_time_sec": 8.227922314981697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4270, "loss": 4.311140060424805, "lr": 0.0002, "elapsed_sec": 35414.76487016678, "step_time_sec": 8.230513552000048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4271, "loss": 4.247509479522705, "lr": 0.0002, "elapsed_sec": 35422.995722055435, "step_time_sec": 8.230682616005652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4272, "loss": 4.086180210113525, "lr": 0.0002, "elapsed_sec": 35431.22706198692, "step_time_sec": 8.231213633989682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4273, "loss": 4.233015537261963, "lr": 0.0002, "elapsed_sec": 35439.458716630936, "step_time_sec": 8.231453857995803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4274, "loss": 4.1061506271362305, "lr": 0.0002, "elapsed_sec": 35447.68909263611, "step_time_sec": 8.230283626005985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4275, "loss": 4.2913103103637695, "lr": 0.0002, "elapsed_sec": 35455.920766830444, "step_time_sec": 8.231435202993453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4276, "loss": 4.185235977172852, "lr": 0.0002, "elapsed_sec": 35464.14986205101, "step_time_sec": 8.228941003995715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4277, "loss": 4.160810470581055, "lr": 0.0002, "elapsed_sec": 35472.38073825836, "step_time_sec": 8.230724648979958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4278, "loss": 4.258241176605225, "lr": 0.0002, "elapsed_sec": 35480.609469890594, "step_time_sec": 8.228562348987907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4279, "loss": 4.217000961303711, "lr": 0.0002, "elapsed_sec": 35488.838871240616, "step_time_sec": 8.22925941698486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4280, "loss": 4.3814921379089355, "lr": 0.0002, "elapsed_sec": 35497.06881523132, "step_time_sec": 8.229757705004886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4281, "loss": 4.323488712310791, "lr": 0.0002, "elapsed_sec": 35505.298085689545, "step_time_sec": 8.229134245979367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4282, "loss": 4.145863056182861, "lr": 0.0002, "elapsed_sec": 35513.528462171555, "step_time_sec": 8.230187394015957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4283, "loss": 4.262333393096924, "lr": 0.0002, "elapsed_sec": 35521.757563591, "step_time_sec": 8.22895557997981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4284, "loss": 4.176297187805176, "lr": 0.0002, "elapsed_sec": 35529.98777937889, "step_time_sec": 8.23012952899444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4285, "loss": 4.251290798187256, "lr": 0.0002, "elapsed_sec": 35538.216791152954, "step_time_sec": 8.228821653989144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4286, "loss": 4.325854301452637, "lr": 0.0002, "elapsed_sec": 35546.44635653496, "step_time_sec": 8.229397752991645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4287, "loss": 4.295139312744141, "lr": 0.0002, "elapsed_sec": 35554.67511677742, "step_time_sec": 8.22861020200071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4288, "loss": 4.209898471832275, "lr": 0.0002, "elapsed_sec": 35562.905888319016, "step_time_sec": 8.230599220987642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4289, "loss": 4.307301998138428, "lr": 0.0002, "elapsed_sec": 35571.1363389492, "step_time_sec": 8.230261135991896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4290, "loss": 4.25217342376709, "lr": 0.0002, "elapsed_sec": 35579.366763830185, "step_time_sec": 8.230250856024213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4291, "loss": 4.2863969802856445, "lr": 0.0002, "elapsed_sec": 35587.597249269485, "step_time_sec": 8.230305618984858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4292, "loss": 4.217835426330566, "lr": 0.0002, "elapsed_sec": 35595.828231573105, "step_time_sec": 8.230826127983164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4293, "loss": 4.276263236999512, "lr": 0.0002, "elapsed_sec": 35604.05898451805, "step_time_sec": 8.230593494023196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4294, "loss": 4.209785461425781, "lr": 0.0002, "elapsed_sec": 35612.28965997696, "step_time_sec": 8.23051672600559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4295, "loss": 4.1529974937438965, "lr": 0.0002, "elapsed_sec": 35620.52094101906, "step_time_sec": 8.23111257198616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4296, "loss": 4.246535301208496, "lr": 0.0002, "elapsed_sec": 35628.75211238861, "step_time_sec": 8.23101747999317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4297, "loss": 4.175181865692139, "lr": 0.0002, "elapsed_sec": 35636.982788562775, "step_time_sec": 8.230515750998165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4298, "loss": 4.244187831878662, "lr": 0.0002, "elapsed_sec": 35645.21082878113, "step_time_sec": 8.227937907999149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4299, "loss": 4.290534973144531, "lr": 0.0002, "elapsed_sec": 35653.442783117294, "step_time_sec": 8.2317797630094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4300, "loss": 4.309642791748047, "lr": 0.0002, "elapsed_sec": 35661.67347574234, "step_time_sec": 8.230466383975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4301, "loss": 4.309964179992676, "lr": 0.0002, "elapsed_sec": 35669.90243721008, "step_time_sec": 8.22886592298164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4302, "loss": 4.275486469268799, "lr": 0.0002, "elapsed_sec": 35678.130984306335, "step_time_sec": 8.228317613975378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4303, "loss": 4.226673603057861, "lr": 0.0002, "elapsed_sec": 35686.36182117462, "step_time_sec": 8.230656311992789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4304, "loss": 4.213547706604004, "lr": 0.0002, "elapsed_sec": 35694.59314060211, "step_time_sec": 8.231153394008288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4305, "loss": 4.3043341636657715, "lr": 0.0002, "elapsed_sec": 35702.82198023796, "step_time_sec": 8.228681704000337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4306, "loss": 4.279517650604248, "lr": 0.0002, "elapsed_sec": 35711.05281043053, "step_time_sec": 8.230670744000236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4307, "loss": 4.270864963531494, "lr": 0.0002, "elapsed_sec": 35719.28226208687, "step_time_sec": 8.229345845000353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4308, "loss": 4.283788204193115, "lr": 0.0002, "elapsed_sec": 35727.51136445999, "step_time_sec": 8.228893146006158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4309, "loss": 4.301950931549072, "lr": 0.0002, "elapsed_sec": 35735.74195146561, "step_time_sec": 8.230425297981128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4310, "loss": 4.178194522857666, "lr": 0.0002, "elapsed_sec": 35743.97291922569, "step_time_sec": 8.230826733022695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4311, "loss": 4.267842769622803, "lr": 0.0002, "elapsed_sec": 35752.20412540436, "step_time_sec": 8.231049208989134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4312, "loss": 4.102701663970947, "lr": 0.0002, "elapsed_sec": 35760.4348526001, "step_time_sec": 8.230559096991783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4313, "loss": 4.3118085861206055, "lr": 0.0002, "elapsed_sec": 35768.66598153114, "step_time_sec": 8.231032528012292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4314, "loss": 4.1616530418396, "lr": 0.0002, "elapsed_sec": 35776.89697813988, "step_time_sec": 8.230801408004481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4315, "loss": 4.262795448303223, "lr": 0.0002, "elapsed_sec": 35785.12796354294, "step_time_sec": 8.230834366986528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4316, "loss": 4.258522987365723, "lr": 0.0002, "elapsed_sec": 35793.35930418968, "step_time_sec": 8.231166642013704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4317, "loss": 4.194303035736084, "lr": 0.0002, "elapsed_sec": 35801.588628292084, "step_time_sec": 8.229131808009697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4318, "loss": 4.184474945068359, "lr": 0.0002, "elapsed_sec": 35809.81877946854, "step_time_sec": 8.230003990989644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4319, "loss": 4.338397026062012, "lr": 0.0002, "elapsed_sec": 35818.04756689072, "step_time_sec": 8.228585823992034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4320, "loss": 4.211977481842041, "lr": 0.0002, "elapsed_sec": 35826.27841711044, "step_time_sec": 8.230729057017015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4321, "loss": 4.2314653396606445, "lr": 0.0002, "elapsed_sec": 35834.50875735283, "step_time_sec": 8.230190007016063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4322, "loss": 4.283417224884033, "lr": 0.0002, "elapsed_sec": 35842.739018678665, "step_time_sec": 8.230100267013768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4323, "loss": 4.258543491363525, "lr": 0.0002, "elapsed_sec": 35850.969910383224, "step_time_sec": 8.230725959001575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4324, "loss": 4.215366840362549, "lr": 0.0002, "elapsed_sec": 35859.19886445999, "step_time_sec": 8.228785461978987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4325, "loss": 4.192568302154541, "lr": 0.0002, "elapsed_sec": 35867.43007016182, "step_time_sec": 8.231044640007894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4326, "loss": 4.212600231170654, "lr": 0.0002, "elapsed_sec": 35875.66050648689, "step_time_sec": 8.230284129997017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4327, "loss": 4.287402629852295, "lr": 0.0002, "elapsed_sec": 35883.89120006561, "step_time_sec": 8.230587550991913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4328, "loss": 4.327098369598389, "lr": 0.0002, "elapsed_sec": 35892.121985435486, "step_time_sec": 8.230568908009445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4329, "loss": 4.249000549316406, "lr": 0.0002, "elapsed_sec": 35900.3529086113, "step_time_sec": 8.230826282990165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4330, "loss": 4.091454982757568, "lr": 0.0002, "elapsed_sec": 35908.582738637924, "step_time_sec": 8.229681563010672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4331, "loss": 4.2629170417785645, "lr": 0.0002, "elapsed_sec": 35916.810854673386, "step_time_sec": 8.22792367899092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4332, "loss": 4.3571271896362305, "lr": 0.0002, "elapsed_sec": 35925.040347099304, "step_time_sec": 8.229345469007967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4333, "loss": 4.2968244552612305, "lr": 0.0002, "elapsed_sec": 35933.27052497864, "step_time_sec": 8.230010605999269, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4334, "loss": 4.224268913269043, "lr": 0.0002, "elapsed_sec": 35941.500334739685, "step_time_sec": 8.229667714971583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4335, "loss": 4.131846904754639, "lr": 0.0002, "elapsed_sec": 35949.729786634445, "step_time_sec": 8.229333163006231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4336, "loss": 4.240236282348633, "lr": 0.0002, "elapsed_sec": 35957.95952630043, "step_time_sec": 8.229532537981868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4337, "loss": 4.225462913513184, "lr": 0.0002, "elapsed_sec": 35966.18880701065, "step_time_sec": 8.229130836989498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4338, "loss": 4.2891974449157715, "lr": 0.0002, "elapsed_sec": 35974.41861486435, "step_time_sec": 8.229704024008242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4339, "loss": 4.238627910614014, "lr": 0.0002, "elapsed_sec": 35982.64803195, "step_time_sec": 8.229223769012606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4340, "loss": 4.149302959442139, "lr": 0.0002, "elapsed_sec": 35990.87815141678, "step_time_sec": 8.229954373004148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4341, "loss": 4.177601337432861, "lr": 0.0002, "elapsed_sec": 35999.10784244537, "step_time_sec": 8.229537145001814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4342, "loss": 4.22507381439209, "lr": 0.0002, "elapsed_sec": 36007.337990283966, "step_time_sec": 8.229995597997913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4343, "loss": 4.317497730255127, "lr": 0.0002, "elapsed_sec": 36015.5673058033, "step_time_sec": 8.229209817014635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4344, "loss": 4.299892902374268, "lr": 0.0002, "elapsed_sec": 36023.79600191116, "step_time_sec": 8.228512267000042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4345, "loss": 4.2494401931762695, "lr": 0.0002, "elapsed_sec": 36032.026094675064, "step_time_sec": 8.229882329993416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4346, "loss": 4.347548484802246, "lr": 0.0002, "elapsed_sec": 36040.25584959984, "step_time_sec": 8.229678059986327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4347, "loss": 4.26312255859375, "lr": 0.0002, "elapsed_sec": 36048.4868812561, "step_time_sec": 8.230846554011805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4348, "loss": 4.2643327713012695, "lr": 0.0002, "elapsed_sec": 36056.71468448639, "step_time_sec": 8.227605007996317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4349, "loss": 4.065457820892334, "lr": 0.0002, "elapsed_sec": 36064.94375562668, "step_time_sec": 8.22893482001382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4350, "loss": 4.266186237335205, "lr": 0.0002, "elapsed_sec": 36073.17410612106, "step_time_sec": 8.230180701997597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4351, "loss": 4.162608623504639, "lr": 0.0002, "elapsed_sec": 36081.405364751816, "step_time_sec": 8.23112428301829, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4352, "loss": 4.324957370758057, "lr": 0.0002, "elapsed_sec": 36089.63419389725, "step_time_sec": 8.228687341994373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4353, "loss": 4.258346080780029, "lr": 0.0002, "elapsed_sec": 36097.861971616745, "step_time_sec": 8.227601288002916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4354, "loss": 4.055879592895508, "lr": 0.0002, "elapsed_sec": 36106.0922498703, "step_time_sec": 8.230171417002566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4355, "loss": 4.367944717407227, "lr": 0.0002, "elapsed_sec": 36114.32379961014, "step_time_sec": 8.231387307983823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4356, "loss": 4.1921515464782715, "lr": 0.0002, "elapsed_sec": 36122.61374783516, "step_time_sec": 8.232608089980204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4357, "loss": 4.257331848144531, "lr": 0.0002, "elapsed_sec": 36130.841875076294, "step_time_sec": 8.22799572898657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4358, "loss": 4.1654462814331055, "lr": 0.0002, "elapsed_sec": 36139.069722652435, "step_time_sec": 8.22773058200255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4359, "loss": 4.184958457946777, "lr": 0.0002, "elapsed_sec": 36147.29856157303, "step_time_sec": 8.228675854014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4360, "loss": 4.164156913757324, "lr": 0.0002, "elapsed_sec": 36155.52827453613, "step_time_sec": 8.229527004004922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4361, "loss": 4.141138553619385, "lr": 0.0002, "elapsed_sec": 36163.75978207588, "step_time_sec": 8.231377807998797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4362, "loss": 4.20925760269165, "lr": 0.0002, "elapsed_sec": 36171.99026918411, "step_time_sec": 8.230284685996594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4363, "loss": 3.9524648189544678, "lr": 0.0002, "elapsed_sec": 36180.22107577324, "step_time_sec": 8.230650365003385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4364, "loss": 4.152484893798828, "lr": 0.0002, "elapsed_sec": 36188.451288461685, "step_time_sec": 8.230063816998154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4365, "loss": 4.189659595489502, "lr": 0.0002, "elapsed_sec": 36196.68024301529, "step_time_sec": 8.228847777005285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4366, "loss": 4.392133712768555, "lr": 0.0002, "elapsed_sec": 36204.909650564194, "step_time_sec": 8.22925909399055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4367, "loss": 4.17482328414917, "lr": 0.0002, "elapsed_sec": 36213.139149427414, "step_time_sec": 8.229293154989136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4368, "loss": 4.264339447021484, "lr": 0.0002, "elapsed_sec": 36221.36876678467, "step_time_sec": 8.229460137983551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4369, "loss": 4.300177574157715, "lr": 0.0002, "elapsed_sec": 36229.5985853672, "step_time_sec": 8.229719606024446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4370, "loss": 4.3875041007995605, "lr": 0.0002, "elapsed_sec": 36237.82813882828, "step_time_sec": 8.229334623989416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4371, "loss": 4.369135856628418, "lr": 0.0002, "elapsed_sec": 36246.057794332504, "step_time_sec": 8.229507776006358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4372, "loss": 4.25364351272583, "lr": 0.0002, "elapsed_sec": 36254.28675889969, "step_time_sec": 8.228804615995614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4373, "loss": 4.1338582038879395, "lr": 0.0002, "elapsed_sec": 36262.51525259018, "step_time_sec": 8.228365726012271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4374, "loss": 4.185194969177246, "lr": 0.0002, "elapsed_sec": 36270.74606561661, "step_time_sec": 8.230659172986634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4375, "loss": 4.174018383026123, "lr": 0.0002, "elapsed_sec": 36278.97565674782, "step_time_sec": 8.229461500013713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4376, "loss": 4.194162368774414, "lr": 0.0002, "elapsed_sec": 36287.20514702797, "step_time_sec": 8.22930809398531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4377, "loss": 4.043723106384277, "lr": 0.0002, "elapsed_sec": 36295.43462562561, "step_time_sec": 8.229340556979878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4378, "loss": 4.112540245056152, "lr": 0.0002, "elapsed_sec": 36303.6648979187, "step_time_sec": 8.230079291999573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4379, "loss": 4.1676926612854, "lr": 0.0002, "elapsed_sec": 36311.89269852638, "step_time_sec": 8.227670494001359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4380, "loss": 4.304519176483154, "lr": 0.0002, "elapsed_sec": 36320.12198352814, "step_time_sec": 8.229189364006743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4381, "loss": 4.187586784362793, "lr": 0.0002, "elapsed_sec": 36328.35091972351, "step_time_sec": 8.228705088986317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4382, "loss": 4.3834919929504395, "lr": 0.0002, "elapsed_sec": 36336.578838825226, "step_time_sec": 8.227761690999614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4383, "loss": 4.119029998779297, "lr": 0.0002, "elapsed_sec": 36344.80693411827, "step_time_sec": 8.227958197996486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4384, "loss": 4.411799907684326, "lr": 0.0002, "elapsed_sec": 36353.03485131264, "step_time_sec": 8.227775152015965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4385, "loss": 4.120015621185303, "lr": 0.0002, "elapsed_sec": 36361.264479637146, "step_time_sec": 8.229481073009083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4386, "loss": 4.312099456787109, "lr": 0.0002, "elapsed_sec": 36369.495659828186, "step_time_sec": 8.23103305400582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4387, "loss": 4.044397830963135, "lr": 0.0002, "elapsed_sec": 36377.72499918938, "step_time_sec": 8.22914950401173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4388, "loss": 4.2005181312561035, "lr": 0.0002, "elapsed_sec": 36385.95365476608, "step_time_sec": 8.228488420019858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4389, "loss": 4.161875247955322, "lr": 0.0002, "elapsed_sec": 36394.181222200394, "step_time_sec": 8.227493883983698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4390, "loss": 4.165921688079834, "lr": 0.0002, "elapsed_sec": 36402.411669015884, "step_time_sec": 8.230221308011096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4391, "loss": 4.153286933898926, "lr": 0.0002, "elapsed_sec": 36410.643698215485, "step_time_sec": 8.231922683015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4392, "loss": 4.259501934051514, "lr": 0.0002, "elapsed_sec": 36418.873455524445, "step_time_sec": 8.229566678986885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4393, "loss": 4.044652462005615, "lr": 0.0002, "elapsed_sec": 36427.104997873306, "step_time_sec": 8.23138977898634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4394, "loss": 4.211978435516357, "lr": 0.0002, "elapsed_sec": 36435.33618569374, "step_time_sec": 8.231087707012193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4395, "loss": 4.314662456512451, "lr": 0.0002, "elapsed_sec": 36443.56581020355, "step_time_sec": 8.229386232007528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4396, "loss": 4.046874523162842, "lr": 0.0002, "elapsed_sec": 36451.79493808746, "step_time_sec": 8.22897741800989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4397, "loss": 4.238431930541992, "lr": 0.0002, "elapsed_sec": 36460.02586722374, "step_time_sec": 8.23080667501199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4398, "loss": 4.1858391761779785, "lr": 0.0002, "elapsed_sec": 36468.257158756256, "step_time_sec": 8.231167727994034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4399, "loss": 4.1849164962768555, "lr": 0.0002, "elapsed_sec": 36476.48799610138, "step_time_sec": 8.230634678999195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4400, "loss": 4.2425642013549805, "lr": 0.0002, "elapsed_sec": 36484.71907043457, "step_time_sec": 8.230887155019445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4401, "loss": 4.141513347625732, "lr": 0.0002, "elapsed_sec": 36492.950293540955, "step_time_sec": 8.231085278006503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4402, "loss": 4.135616302490234, "lr": 0.0002, "elapsed_sec": 36501.1817984581, "step_time_sec": 8.231378313008463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4403, "loss": 4.231319427490234, "lr": 0.0002, "elapsed_sec": 36509.410571336746, "step_time_sec": 8.228635008999845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4404, "loss": 4.153349876403809, "lr": 0.0002, "elapsed_sec": 36517.64000082016, "step_time_sec": 8.229242555971723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4405, "loss": 4.136769771575928, "lr": 0.0002, "elapsed_sec": 36525.87049102783, "step_time_sec": 8.230367054988164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4406, "loss": 4.406148433685303, "lr": 0.0002, "elapsed_sec": 36534.10128092766, "step_time_sec": 8.230590516002849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4407, "loss": 4.240412712097168, "lr": 0.0002, "elapsed_sec": 36542.33230996132, "step_time_sec": 8.230873185006203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4408, "loss": 4.331292629241943, "lr": 0.0002, "elapsed_sec": 36550.56320476532, "step_time_sec": 8.23079051199602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4409, "loss": 4.216827869415283, "lr": 0.0002, "elapsed_sec": 36558.79389500618, "step_time_sec": 8.230485713982489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4410, "loss": 4.249743938446045, "lr": 0.0002, "elapsed_sec": 36567.02500915527, "step_time_sec": 8.230974568985403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4411, "loss": 4.20154333114624, "lr": 0.0002, "elapsed_sec": 36575.253744363785, "step_time_sec": 8.228588295984082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4412, "loss": 4.371633052825928, "lr": 0.0002, "elapsed_sec": 36583.48414850235, "step_time_sec": 8.230233540001791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4413, "loss": 4.270978927612305, "lr": 0.0002, "elapsed_sec": 36591.71493721008, "step_time_sec": 8.230642351001734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4414, "loss": 4.2607340812683105, "lr": 0.0002, "elapsed_sec": 36599.94582056999, "step_time_sec": 8.230732245981926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4415, "loss": 4.110293388366699, "lr": 0.0002, "elapsed_sec": 36608.176048994064, "step_time_sec": 8.23006121799699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4416, "loss": 4.120038986206055, "lr": 0.0002, "elapsed_sec": 36616.406727313995, "step_time_sec": 8.230523603007896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4417, "loss": 4.1619415283203125, "lr": 0.0002, "elapsed_sec": 36624.63723039627, "step_time_sec": 8.230386733979685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4418, "loss": 4.190310955047607, "lr": 0.0002, "elapsed_sec": 36632.86537837982, "step_time_sec": 8.227942251018248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4419, "loss": 4.204882621765137, "lr": 0.0002, "elapsed_sec": 36641.09369087219, "step_time_sec": 8.228188227018109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4420, "loss": 4.304381370544434, "lr": 0.0002, "elapsed_sec": 36649.32309794426, "step_time_sec": 8.229281303996686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4421, "loss": 4.178755283355713, "lr": 0.0002, "elapsed_sec": 36657.553376197815, "step_time_sec": 8.230099489999702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4422, "loss": 4.260101795196533, "lr": 0.0002, "elapsed_sec": 36665.783391714096, "step_time_sec": 8.229900981998071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4423, "loss": 4.297863960266113, "lr": 0.0002, "elapsed_sec": 36674.01269888878, "step_time_sec": 8.229117933980888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4424, "loss": 4.222700119018555, "lr": 0.0002, "elapsed_sec": 36682.242916584015, "step_time_sec": 8.23003019898897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4425, "loss": 4.101283073425293, "lr": 0.0002, "elapsed_sec": 36690.47337293625, "step_time_sec": 8.230318854999496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4426, "loss": 4.390987873077393, "lr": 0.0002, "elapsed_sec": 36698.701536893845, "step_time_sec": 8.228029866004363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4427, "loss": 4.281476020812988, "lr": 0.0002, "elapsed_sec": 36706.93179726601, "step_time_sec": 8.23017396000796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4428, "loss": 4.216615200042725, "lr": 0.0002, "elapsed_sec": 36715.165484428406, "step_time_sec": 8.23351946900948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4429, "loss": 4.401393890380859, "lr": 0.0002, "elapsed_sec": 36723.39721465111, "step_time_sec": 8.231538128980901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4430, "loss": 4.208059787750244, "lr": 0.0002, "elapsed_sec": 36731.62801027298, "step_time_sec": 8.230636464024428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4431, "loss": 4.3284502029418945, "lr": 0.0002, "elapsed_sec": 36739.85845422745, "step_time_sec": 8.230278759001521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4432, "loss": 4.192310810089111, "lr": 0.0002, "elapsed_sec": 36748.08914279938, "step_time_sec": 8.23052952499711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4433, "loss": 4.271057605743408, "lr": 0.0002, "elapsed_sec": 36756.317984342575, "step_time_sec": 8.228606208984274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4434, "loss": 4.090157508850098, "lr": 0.0002, "elapsed_sec": 36764.54760813713, "step_time_sec": 8.22952823797823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4435, "loss": 4.354140281677246, "lr": 0.0002, "elapsed_sec": 36772.776252269745, "step_time_sec": 8.228451083006803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4436, "loss": 4.344651699066162, "lr": 0.0002, "elapsed_sec": 36781.00517177582, "step_time_sec": 8.228829999017762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4437, "loss": 4.194459915161133, "lr": 0.0002, "elapsed_sec": 36789.233670949936, "step_time_sec": 8.228320462018019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4438, "loss": 4.179614067077637, "lr": 0.0002, "elapsed_sec": 36797.462050914764, "step_time_sec": 8.228218907024711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4439, "loss": 4.237903594970703, "lr": 0.0002, "elapsed_sec": 36805.69318675995, "step_time_sec": 8.230969374009874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4440, "loss": 4.122494220733643, "lr": 0.0002, "elapsed_sec": 36813.92321872711, "step_time_sec": 8.229896093980642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4441, "loss": 4.228018283843994, "lr": 0.0002, "elapsed_sec": 36822.15285587311, "step_time_sec": 8.229455833992688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4442, "loss": 4.230996131896973, "lr": 0.0002, "elapsed_sec": 36830.38186454773, "step_time_sec": 8.228844423021656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4443, "loss": 4.272576332092285, "lr": 0.0002, "elapsed_sec": 36838.609652519226, "step_time_sec": 8.227683404984418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4444, "loss": 4.264408588409424, "lr": 0.0002, "elapsed_sec": 36846.840299129486, "step_time_sec": 8.230469775997335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4445, "loss": 4.093929767608643, "lr": 0.0002, "elapsed_sec": 36855.06919717789, "step_time_sec": 8.22881628802861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4446, "loss": 4.2583842277526855, "lr": 0.0002, "elapsed_sec": 36863.29875040054, "step_time_sec": 8.229317154007731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4447, "loss": 4.189101219177246, "lr": 0.0002, "elapsed_sec": 36871.527871608734, "step_time_sec": 8.22898630599957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4448, "loss": 4.082111358642578, "lr": 0.0002, "elapsed_sec": 36879.75539779663, "step_time_sec": 8.227407891012263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4449, "loss": 4.23552131652832, "lr": 0.0002, "elapsed_sec": 36887.985251665115, "step_time_sec": 8.229642580990912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4450, "loss": 4.1672444343566895, "lr": 0.0002, "elapsed_sec": 36896.21505689621, "step_time_sec": 8.229716428992106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4451, "loss": 4.20103645324707, "lr": 0.0002, "elapsed_sec": 36904.44566440582, "step_time_sec": 8.230350603000261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4452, "loss": 4.1907267570495605, "lr": 0.0002, "elapsed_sec": 36912.67660117149, "step_time_sec": 8.230774531984935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4453, "loss": 4.143136024475098, "lr": 0.0002, "elapsed_sec": 36920.90690159798, "step_time_sec": 8.230217720993096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4454, "loss": 4.176834583282471, "lr": 0.0002, "elapsed_sec": 36929.138214588165, "step_time_sec": 8.231115432019578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4455, "loss": 4.226271152496338, "lr": 0.0002, "elapsed_sec": 36937.36861348152, "step_time_sec": 8.230284146004124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4456, "loss": 4.321059703826904, "lr": 0.0002, "elapsed_sec": 36945.60051059723, "step_time_sec": 8.231734838016564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4457, "loss": 4.199167728424072, "lr": 0.0002, "elapsed_sec": 36953.83038663864, "step_time_sec": 8.229689582018182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4458, "loss": 4.198840141296387, "lr": 0.0002, "elapsed_sec": 36962.05859899521, "step_time_sec": 8.228068494994659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4459, "loss": 4.125287055969238, "lr": 0.0002, "elapsed_sec": 36970.28918480873, "step_time_sec": 8.23042309700395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4460, "loss": 4.179234027862549, "lr": 0.0002, "elapsed_sec": 36978.52001166344, "step_time_sec": 8.230680574983126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4461, "loss": 4.198362827301025, "lr": 0.0002, "elapsed_sec": 36986.74929904938, "step_time_sec": 8.229177425004309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4462, "loss": 4.254469871520996, "lr": 0.0002, "elapsed_sec": 36994.979449272156, "step_time_sec": 8.229994149005506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4463, "loss": 4.234610557556152, "lr": 0.0002, "elapsed_sec": 37003.21052622795, "step_time_sec": 8.230873341992265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4464, "loss": 4.311409950256348, "lr": 0.0002, "elapsed_sec": 37011.44287824631, "step_time_sec": 8.232262776000425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4465, "loss": 4.159437656402588, "lr": 0.0002, "elapsed_sec": 37019.673595905304, "step_time_sec": 8.230566341982922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4466, "loss": 4.187036991119385, "lr": 0.0002, "elapsed_sec": 37027.904832839966, "step_time_sec": 8.231022088992177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4467, "loss": 4.0898566246032715, "lr": 0.0002, "elapsed_sec": 37036.13209724426, "step_time_sec": 8.227157404005993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4468, "loss": 4.229953765869141, "lr": 0.0002, "elapsed_sec": 37044.36152458191, "step_time_sec": 8.229223069996806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4469, "loss": 4.192503929138184, "lr": 0.0002, "elapsed_sec": 37052.5924346447, "step_time_sec": 8.230752912990283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4470, "loss": 3.977433443069458, "lr": 0.0002, "elapsed_sec": 37060.82342362404, "step_time_sec": 8.230841574986698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4471, "loss": 4.186487674713135, "lr": 0.0002, "elapsed_sec": 37069.05417609215, "step_time_sec": 8.230615303997183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4472, "loss": 4.165550231933594, "lr": 0.0002, "elapsed_sec": 37077.28473901749, "step_time_sec": 8.230414302001009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4473, "loss": 4.106449604034424, "lr": 0.0002, "elapsed_sec": 37085.51385760307, "step_time_sec": 8.228957156999968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4474, "loss": 4.114609241485596, "lr": 0.0002, "elapsed_sec": 37093.742668390274, "step_time_sec": 8.228724326996598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4475, "loss": 4.2983551025390625, "lr": 0.0002, "elapsed_sec": 37101.97388839722, "step_time_sec": 8.231011372001376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4476, "loss": 4.149158477783203, "lr": 0.0002, "elapsed_sec": 37110.204110622406, "step_time_sec": 8.23010311301914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4477, "loss": 4.215029716491699, "lr": 0.0002, "elapsed_sec": 37118.43580007553, "step_time_sec": 8.231489170982968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4478, "loss": 4.115890979766846, "lr": 0.0002, "elapsed_sec": 37126.667387247086, "step_time_sec": 8.23146651199204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4479, "loss": 4.2483086585998535, "lr": 0.0002, "elapsed_sec": 37134.89862990379, "step_time_sec": 8.231101324985502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4480, "loss": 4.229401588439941, "lr": 0.0002, "elapsed_sec": 37143.12702178955, "step_time_sec": 8.228197243006434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4481, "loss": 4.209546089172363, "lr": 0.0002, "elapsed_sec": 37151.35611343384, "step_time_sec": 8.22897270298563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4482, "loss": 4.411577224731445, "lr": 0.0002, "elapsed_sec": 37159.587246418, "step_time_sec": 8.230929911980638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4483, "loss": 4.1431884765625, "lr": 0.0002, "elapsed_sec": 37167.81806755066, "step_time_sec": 8.230644878000021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4484, "loss": 4.16637659072876, "lr": 0.0002, "elapsed_sec": 37176.04919743538, "step_time_sec": 8.231040623009903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4485, "loss": 4.234988689422607, "lr": 0.0002, "elapsed_sec": 37184.279940366745, "step_time_sec": 8.230513418995542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4486, "loss": 4.188275337219238, "lr": 0.0002, "elapsed_sec": 37192.51105594635, "step_time_sec": 8.230977296974743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4487, "loss": 4.097301006317139, "lr": 0.0002, "elapsed_sec": 37200.74186253548, "step_time_sec": 8.230640101013705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4488, "loss": 4.238630771636963, "lr": 0.0002, "elapsed_sec": 37208.97044157982, "step_time_sec": 8.22842488597962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4489, "loss": 4.288151741027832, "lr": 0.0002, "elapsed_sec": 37217.19951438904, "step_time_sec": 8.228978573984932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4490, "loss": 4.198022842407227, "lr": 0.0002, "elapsed_sec": 37225.42836976051, "step_time_sec": 8.228634966013487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4491, "loss": 4.26085090637207, "lr": 0.0002, "elapsed_sec": 37233.6587433815, "step_time_sec": 8.230226218001917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4492, "loss": 4.192955493927002, "lr": 0.0002, "elapsed_sec": 37241.889830350876, "step_time_sec": 8.23094804197899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4493, "loss": 4.083198070526123, "lr": 0.0002, "elapsed_sec": 37250.11997818947, "step_time_sec": 8.23003241600236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4494, "loss": 4.129360675811768, "lr": 0.0002, "elapsed_sec": 37258.34950399399, "step_time_sec": 8.229378867021296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4495, "loss": 4.1066670417785645, "lr": 0.0002, "elapsed_sec": 37266.57843542099, "step_time_sec": 8.228690312011167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4496, "loss": 4.111039638519287, "lr": 0.0002, "elapsed_sec": 37274.80888724327, "step_time_sec": 8.230301161005627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4497, "loss": 4.214442729949951, "lr": 0.0002, "elapsed_sec": 37283.03976583481, "step_time_sec": 8.230741389008472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4498, "loss": 4.224349021911621, "lr": 0.0002, "elapsed_sec": 37291.27052593231, "step_time_sec": 8.230669659009436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4499, "loss": 4.100751876831055, "lr": 0.0002, "elapsed_sec": 37299.50162625313, "step_time_sec": 8.230901180999354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4500, "loss": 4.327052116394043, "lr": 0.0002, "elapsed_sec": 37307.729874134064, "step_time_sec": 29.707921674009413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4501, "loss": 4.215421199798584, "lr": 0.0002, "elapsed_sec": 37337.445558309555, "step_time_sec": 8.235374940995825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4502, "loss": 4.180518627166748, "lr": 0.0002, "elapsed_sec": 37345.662382125854, "step_time_sec": 8.216639988007955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4503, "loss": 4.170928955078125, "lr": 0.0002, "elapsed_sec": 37353.87950181961, "step_time_sec": 8.216949050984113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4504, "loss": 4.164516448974609, "lr": 0.0002, "elapsed_sec": 37362.09684562683, "step_time_sec": 8.21722792001674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4505, "loss": 4.036762237548828, "lr": 0.0002, "elapsed_sec": 37370.31532454491, "step_time_sec": 8.218346774985548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4506, "loss": 4.248170375823975, "lr": 0.0002, "elapsed_sec": 37378.53237938881, "step_time_sec": 8.216912115021842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4507, "loss": 4.087418079376221, "lr": 0.0002, "elapsed_sec": 37386.752640247345, "step_time_sec": 8.220031925011426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4508, "loss": 4.154626369476318, "lr": 0.0002, "elapsed_sec": 37394.98382282257, "step_time_sec": 8.231121103017358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4509, "loss": 4.089838027954102, "lr": 0.0002, "elapsed_sec": 37403.21581029892, "step_time_sec": 8.231746610021219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4510, "loss": 4.0511579513549805, "lr": 0.0002, "elapsed_sec": 37411.44621062279, "step_time_sec": 8.230301778996363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4511, "loss": 4.148171901702881, "lr": 0.0002, "elapsed_sec": 37419.674380779266, "step_time_sec": 8.227985941019142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4512, "loss": 4.223855972290039, "lr": 0.0002, "elapsed_sec": 37427.905218839645, "step_time_sec": 8.230662418995053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4513, "loss": 4.1039838790893555, "lr": 0.0002, "elapsed_sec": 37436.135283231735, "step_time_sec": 8.229911475005792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4514, "loss": 4.209411144256592, "lr": 0.0002, "elapsed_sec": 37444.3662481308, "step_time_sec": 8.230798336997395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4515, "loss": 4.23700475692749, "lr": 0.0002, "elapsed_sec": 37452.596997499466, "step_time_sec": 8.230582378979307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4516, "loss": 4.138341903686523, "lr": 0.0002, "elapsed_sec": 37460.82733821869, "step_time_sec": 8.230165581015171, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4517, "loss": 4.106832981109619, "lr": 0.0002, "elapsed_sec": 37469.057512521744, "step_time_sec": 8.229997501010075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4518, "loss": 4.123337268829346, "lr": 0.0002, "elapsed_sec": 37477.28689408302, "step_time_sec": 8.229285378998611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4519, "loss": 4.312911510467529, "lr": 0.0002, "elapsed_sec": 37485.51616644859, "step_time_sec": 8.229111400985857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4520, "loss": 4.088366985321045, "lr": 0.0002, "elapsed_sec": 37493.747754096985, "step_time_sec": 8.231369935005205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4521, "loss": 4.181707382202148, "lr": 0.0002, "elapsed_sec": 37501.97948551178, "step_time_sec": 8.231577797996579, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4522, "loss": 4.07486629486084, "lr": 0.0002, "elapsed_sec": 37510.20936059952, "step_time_sec": 8.229734307999024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4523, "loss": 4.193417549133301, "lr": 0.0002, "elapsed_sec": 37518.44157767296, "step_time_sec": 8.232074567000382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4524, "loss": 3.9615328311920166, "lr": 0.0002, "elapsed_sec": 37526.67241048813, "step_time_sec": 8.23070525098592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4525, "loss": 4.091105937957764, "lr": 0.0002, "elapsed_sec": 37534.90113282204, "step_time_sec": 8.228509887005202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4526, "loss": 4.0483269691467285, "lr": 0.0002, "elapsed_sec": 37543.1319565773, "step_time_sec": 8.230688792013098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4527, "loss": 4.230967044830322, "lr": 0.0002, "elapsed_sec": 37551.36328101158, "step_time_sec": 8.23121883199201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4528, "loss": 4.185821056365967, "lr": 0.0002, "elapsed_sec": 37559.5942595005, "step_time_sec": 8.2308153519989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4529, "loss": 4.2938971519470215, "lr": 0.0002, "elapsed_sec": 37567.82435083389, "step_time_sec": 8.229946021019714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4530, "loss": 4.125638484954834, "lr": 0.0002, "elapsed_sec": 37576.05574464798, "step_time_sec": 8.231216888001654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4531, "loss": 4.239560604095459, "lr": 0.0002, "elapsed_sec": 37584.28559398651, "step_time_sec": 8.22967050599982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4532, "loss": 4.17262077331543, "lr": 0.0002, "elapsed_sec": 37592.5165348053, "step_time_sec": 8.230786864995025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4533, "loss": 4.168882369995117, "lr": 0.0002, "elapsed_sec": 37600.74668097496, "step_time_sec": 8.23000360399601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4534, "loss": 4.113263130187988, "lr": 0.0002, "elapsed_sec": 37608.97698664665, "step_time_sec": 8.230097336985637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4535, "loss": 4.178826332092285, "lr": 0.0002, "elapsed_sec": 37617.20763421059, "step_time_sec": 8.230511554022087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4536, "loss": 4.2247538566589355, "lr": 0.0002, "elapsed_sec": 37625.43807244301, "step_time_sec": 8.23028473899467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4537, "loss": 3.900763511657715, "lr": 0.0002, "elapsed_sec": 37633.66936540604, "step_time_sec": 8.231208368000807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4538, "loss": 4.074326992034912, "lr": 0.0002, "elapsed_sec": 37641.90047264099, "step_time_sec": 8.230865015997551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4539, "loss": 4.1889543533325195, "lr": 0.0002, "elapsed_sec": 37650.13155055046, "step_time_sec": 8.23094344700803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4540, "loss": 4.14438533782959, "lr": 0.0002, "elapsed_sec": 37658.361711502075, "step_time_sec": 8.23003430699464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4541, "loss": 4.334005832672119, "lr": 0.0002, "elapsed_sec": 37666.59052109718, "step_time_sec": 8.228665754984831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4542, "loss": 4.144692420959473, "lr": 0.0002, "elapsed_sec": 37674.82297205925, "step_time_sec": 8.232235503994161, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4543, "loss": 4.072928428649902, "lr": 0.0002, "elapsed_sec": 37683.05314064026, "step_time_sec": 8.230046898999717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4544, "loss": 4.221843719482422, "lr": 0.0002, "elapsed_sec": 37691.28414559364, "step_time_sec": 8.230821382021531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4545, "loss": 4.095033645629883, "lr": 0.0002, "elapsed_sec": 37699.513996601105, "step_time_sec": 8.229732103005517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4546, "loss": 4.032676696777344, "lr": 0.0002, "elapsed_sec": 37707.74360227585, "step_time_sec": 8.229396117996657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4547, "loss": 4.083575248718262, "lr": 0.0002, "elapsed_sec": 37715.970851421356, "step_time_sec": 8.227143566997256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4548, "loss": 4.083851337432861, "lr": 0.0002, "elapsed_sec": 37724.19906377792, "step_time_sec": 8.228004869015422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4549, "loss": 4.35017728805542, "lr": 0.0002, "elapsed_sec": 37732.42791414261, "step_time_sec": 8.22868217600626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4550, "loss": 4.143795013427734, "lr": 0.0002, "elapsed_sec": 37740.656860113144, "step_time_sec": 8.228775061987108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4551, "loss": 4.156291484832764, "lr": 0.0002, "elapsed_sec": 37748.88540434837, "step_time_sec": 8.228393645986216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4552, "loss": 4.185402870178223, "lr": 0.0002, "elapsed_sec": 37757.11632847786, "step_time_sec": 8.23077706102049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4553, "loss": 4.410069465637207, "lr": 0.0002, "elapsed_sec": 37765.34741806984, "step_time_sec": 8.230941926012747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4554, "loss": 4.127676010131836, "lr": 0.0002, "elapsed_sec": 37773.57504892349, "step_time_sec": 8.227493590005906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4555, "loss": 4.138251304626465, "lr": 0.0002, "elapsed_sec": 37781.8058052063, "step_time_sec": 8.230654371000128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4556, "loss": 4.169552326202393, "lr": 0.0002, "elapsed_sec": 37790.03753042221, "step_time_sec": 8.231482823000988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4557, "loss": 4.27599573135376, "lr": 0.0002, "elapsed_sec": 37798.267441511154, "step_time_sec": 8.229819978005253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4558, "loss": 4.1307878494262695, "lr": 0.0002, "elapsed_sec": 37806.49643087387, "step_time_sec": 8.228812750021461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4559, "loss": 4.068817138671875, "lr": 0.0002, "elapsed_sec": 37814.726640701294, "step_time_sec": 8.230021952011157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4560, "loss": 4.09800910949707, "lr": 0.0002, "elapsed_sec": 37822.95645928383, "step_time_sec": 8.229640706005739, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4561, "loss": 4.409309387207031, "lr": 0.0002, "elapsed_sec": 37831.1847178936, "step_time_sec": 8.22816724900622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4562, "loss": 4.1215105056762695, "lr": 0.0002, "elapsed_sec": 37839.41276693344, "step_time_sec": 8.227843569999095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4563, "loss": 4.164560794830322, "lr": 0.0002, "elapsed_sec": 37847.64303231239, "step_time_sec": 8.230112259014277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4564, "loss": 4.325926780700684, "lr": 0.0002, "elapsed_sec": 37855.87146115303, "step_time_sec": 8.228315501997713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4565, "loss": 3.9853551387786865, "lr": 0.0002, "elapsed_sec": 37864.10010933876, "step_time_sec": 8.228449165995698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4566, "loss": 4.281054973602295, "lr": 0.0002, "elapsed_sec": 37872.32932925224, "step_time_sec": 8.229084975988371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4567, "loss": 3.751894235610962, "lr": 0.0002, "elapsed_sec": 37880.56060361862, "step_time_sec": 8.231110672000796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4568, "loss": 4.195075988769531, "lr": 0.0002, "elapsed_sec": 37888.792093515396, "step_time_sec": 8.23135611298494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4569, "loss": 4.1971516609191895, "lr": 0.0002, "elapsed_sec": 37897.02335643768, "step_time_sec": 8.231059436016949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4570, "loss": 4.2331438064575195, "lr": 0.0002, "elapsed_sec": 37905.25336480141, "step_time_sec": 8.229881022998597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4571, "loss": 4.158837795257568, "lr": 0.0002, "elapsed_sec": 37913.4836473465, "step_time_sec": 8.23016388702672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4572, "loss": 4.274033069610596, "lr": 0.0002, "elapsed_sec": 37921.7142829895, "step_time_sec": 8.230457157013007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4573, "loss": 4.078027248382568, "lr": 0.0002, "elapsed_sec": 37929.94425082207, "step_time_sec": 8.229785591975087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4574, "loss": 4.1257829666137695, "lr": 0.0002, "elapsed_sec": 37938.174021720886, "step_time_sec": 8.229604189982638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4575, "loss": 4.257504940032959, "lr": 0.0002, "elapsed_sec": 37946.40486073494, "step_time_sec": 8.230713627999648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4576, "loss": 4.036767482757568, "lr": 0.0002, "elapsed_sec": 37954.636452674866, "step_time_sec": 8.231461974006379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4577, "loss": 4.220331192016602, "lr": 0.0002, "elapsed_sec": 37962.86701130867, "step_time_sec": 8.230342730996199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4578, "loss": 4.239326477050781, "lr": 0.0002, "elapsed_sec": 37971.09917974472, "step_time_sec": 8.231998992996523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4579, "loss": 4.193966865539551, "lr": 0.0002, "elapsed_sec": 37979.32933020592, "step_time_sec": 8.22998398297932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4580, "loss": 4.1249284744262695, "lr": 0.0002, "elapsed_sec": 37987.560290813446, "step_time_sec": 8.230816986004356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4581, "loss": 4.090322971343994, "lr": 0.0002, "elapsed_sec": 37995.79107618332, "step_time_sec": 8.23061661500833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4582, "loss": 4.203220367431641, "lr": 0.0002, "elapsed_sec": 38004.02314782143, "step_time_sec": 8.231981615012046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4583, "loss": 4.477423667907715, "lr": 0.0002, "elapsed_sec": 38012.25493168831, "step_time_sec": 8.231577689002734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4584, "loss": 4.205228328704834, "lr": 0.0002, "elapsed_sec": 38020.48511719704, "step_time_sec": 8.230030700011412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4585, "loss": 4.26301383972168, "lr": 0.0002, "elapsed_sec": 38028.7164542675, "step_time_sec": 8.23118149099173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4586, "loss": 4.282825469970703, "lr": 0.0002, "elapsed_sec": 38036.945805072784, "step_time_sec": 8.22922142301104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4587, "loss": 4.222055912017822, "lr": 0.0002, "elapsed_sec": 38045.17504143715, "step_time_sec": 8.229050132009434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4588, "loss": 4.179165363311768, "lr": 0.0002, "elapsed_sec": 38053.40546941757, "step_time_sec": 8.23028446501121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4589, "loss": 4.0595855712890625, "lr": 0.0002, "elapsed_sec": 38061.636035203934, "step_time_sec": 8.230440137005644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4590, "loss": 4.345588207244873, "lr": 0.0002, "elapsed_sec": 38069.866708517075, "step_time_sec": 8.230497691984056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4591, "loss": 4.118975639343262, "lr": 0.0002, "elapsed_sec": 38078.097140312195, "step_time_sec": 8.23024633701425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4592, "loss": 4.04262113571167, "lr": 0.0002, "elapsed_sec": 38086.32817029953, "step_time_sec": 8.230881989002228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4593, "loss": 4.060052871704102, "lr": 0.0002, "elapsed_sec": 38094.55903458595, "step_time_sec": 8.230709495022893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4594, "loss": 4.16218900680542, "lr": 0.0002, "elapsed_sec": 38102.789501428604, "step_time_sec": 8.230307778983843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4595, "loss": 4.242986679077148, "lr": 0.0002, "elapsed_sec": 38111.0209813118, "step_time_sec": 8.23131788498722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4596, "loss": 4.231956481933594, "lr": 0.0002, "elapsed_sec": 38119.25182843208, "step_time_sec": 8.230747556022834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4597, "loss": 4.390766620635986, "lr": 0.0002, "elapsed_sec": 38127.48373746872, "step_time_sec": 8.231720273004612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4598, "loss": 4.350234508514404, "lr": 0.0002, "elapsed_sec": 38135.714569330215, "step_time_sec": 8.2306727420073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4599, "loss": 4.217258930206299, "lr": 0.0002, "elapsed_sec": 38143.944298028946, "step_time_sec": 8.229594062024262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4600, "loss": 4.102018356323242, "lr": 0.0002, "elapsed_sec": 38152.1732339859, "step_time_sec": 8.228779199009296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4601, "loss": 4.2400078773498535, "lr": 0.0002, "elapsed_sec": 38160.40319132805, "step_time_sec": 8.229804669012083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4602, "loss": 4.3817338943481445, "lr": 0.0002, "elapsed_sec": 38168.63359332085, "step_time_sec": 8.230237602983834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4603, "loss": 4.077898979187012, "lr": 0.0002, "elapsed_sec": 38176.8637907505, "step_time_sec": 8.23010022600647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4604, "loss": 4.243672847747803, "lr": 0.0002, "elapsed_sec": 38185.095027685165, "step_time_sec": 8.231008344009751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4605, "loss": 4.081779956817627, "lr": 0.0002, "elapsed_sec": 38193.32575011253, "step_time_sec": 8.230573797016405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4606, "loss": 4.0187788009643555, "lr": 0.0002, "elapsed_sec": 38201.556357860565, "step_time_sec": 8.230445167981088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4607, "loss": 4.228039264678955, "lr": 0.0002, "elapsed_sec": 38209.78674888611, "step_time_sec": 8.230278343020473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4608, "loss": 4.110009670257568, "lr": 0.0002, "elapsed_sec": 38218.017904281616, "step_time_sec": 8.230965865019243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4609, "loss": 4.191781520843506, "lr": 0.0002, "elapsed_sec": 38226.248648405075, "step_time_sec": 8.230592150997836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4610, "loss": 4.1817240715026855, "lr": 0.0002, "elapsed_sec": 38234.480055332184, "step_time_sec": 8.231264247995568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4611, "loss": 4.3160786628723145, "lr": 0.0002, "elapsed_sec": 38242.7115881443, "step_time_sec": 8.23136869998416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4612, "loss": 4.3210225105285645, "lr": 0.0002, "elapsed_sec": 38250.939472198486, "step_time_sec": 8.22776706100558, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4613, "loss": 4.257686138153076, "lr": 0.0002, "elapsed_sec": 38259.16811180115, "step_time_sec": 8.228459197998745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4614, "loss": 4.120471954345703, "lr": 0.0002, "elapsed_sec": 38267.39906716347, "step_time_sec": 8.230796674994053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4615, "loss": 4.1417365074157715, "lr": 0.0002, "elapsed_sec": 38275.63114929199, "step_time_sec": 8.231902970001101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4616, "loss": 4.266590595245361, "lr": 0.0002, "elapsed_sec": 38283.86200714111, "step_time_sec": 8.23071305899066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4617, "loss": 4.103911399841309, "lr": 0.0002, "elapsed_sec": 38292.092206954956, "step_time_sec": 8.23013435502071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4618, "loss": 4.216277599334717, "lr": 0.0002, "elapsed_sec": 38300.32359981537, "step_time_sec": 8.23116245502024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4619, "loss": 4.230234146118164, "lr": 0.0002, "elapsed_sec": 38308.552531957626, "step_time_sec": 8.228781226003775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4620, "loss": 4.245184898376465, "lr": 0.0002, "elapsed_sec": 38316.78387880325, "step_time_sec": 8.231175377004547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4621, "loss": 4.136484622955322, "lr": 0.0002, "elapsed_sec": 38325.01524615288, "step_time_sec": 8.23122271199827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4622, "loss": 4.251737117767334, "lr": 0.0002, "elapsed_sec": 38333.24642968178, "step_time_sec": 8.231030613998882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4623, "loss": 4.184845447540283, "lr": 0.0002, "elapsed_sec": 38341.47779750824, "step_time_sec": 8.231208731012885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4624, "loss": 4.15277099609375, "lr": 0.0002, "elapsed_sec": 38349.70870041847, "step_time_sec": 8.230807978019584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4625, "loss": 4.014227390289307, "lr": 0.0002, "elapsed_sec": 38357.937161922455, "step_time_sec": 8.228255560010439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4626, "loss": 4.120833396911621, "lr": 0.0002, "elapsed_sec": 38366.166412353516, "step_time_sec": 8.229094485985115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4627, "loss": 4.147994518280029, "lr": 0.0002, "elapsed_sec": 38374.39763402939, "step_time_sec": 8.231058153993217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4628, "loss": 4.183567047119141, "lr": 0.0002, "elapsed_sec": 38382.628172159195, "step_time_sec": 8.230406365997624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4629, "loss": 4.293208599090576, "lr": 0.0002, "elapsed_sec": 38390.859449863434, "step_time_sec": 8.231123706005746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4630, "loss": 4.269391059875488, "lr": 0.0002, "elapsed_sec": 38399.08996319771, "step_time_sec": 8.230360782006755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4631, "loss": 4.246944904327393, "lr": 0.0002, "elapsed_sec": 38407.321522951126, "step_time_sec": 8.231453140004305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4632, "loss": 4.153900623321533, "lr": 0.0002, "elapsed_sec": 38415.5521941185, "step_time_sec": 8.230526805011323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4633, "loss": 4.298314571380615, "lr": 0.0002, "elapsed_sec": 38423.78238725662, "step_time_sec": 8.22997428799863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4634, "loss": 4.152212619781494, "lr": 0.0002, "elapsed_sec": 38432.01143217087, "step_time_sec": 8.228920166002354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4635, "loss": 4.1844048500061035, "lr": 0.0002, "elapsed_sec": 38440.24048662186, "step_time_sec": 8.228874302993063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4636, "loss": 4.219970226287842, "lr": 0.0002, "elapsed_sec": 38448.469504117966, "step_time_sec": 8.228878702007933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4637, "loss": 4.2551798820495605, "lr": 0.0002, "elapsed_sec": 38456.697751522064, "step_time_sec": 8.228041235008277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4638, "loss": 4.190101146697998, "lr": 0.0002, "elapsed_sec": 38464.928550481796, "step_time_sec": 8.230644139985088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4639, "loss": 4.320748805999756, "lr": 0.0002, "elapsed_sec": 38473.16026163101, "step_time_sec": 8.23156183201354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4640, "loss": 4.186767101287842, "lr": 0.0002, "elapsed_sec": 38481.38871431351, "step_time_sec": 8.228311023005517, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4641, "loss": 4.1982831954956055, "lr": 0.0002, "elapsed_sec": 38489.61936593056, "step_time_sec": 8.230513253016397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4642, "loss": 4.193739414215088, "lr": 0.0002, "elapsed_sec": 38497.84752321243, "step_time_sec": 8.227976335998392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4643, "loss": 4.243748188018799, "lr": 0.0002, "elapsed_sec": 38506.076577425, "step_time_sec": 8.228911279991735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4644, "loss": 4.096246719360352, "lr": 0.0002, "elapsed_sec": 38514.30682635307, "step_time_sec": 8.230095189006533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4645, "loss": 4.12103796005249, "lr": 0.0002, "elapsed_sec": 38522.53525972366, "step_time_sec": 8.22830931498902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4646, "loss": 4.173638820648193, "lr": 0.0002, "elapsed_sec": 38530.76406741142, "step_time_sec": 8.228595424996456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4647, "loss": 4.008982181549072, "lr": 0.0002, "elapsed_sec": 38538.99341392517, "step_time_sec": 8.22918471400044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4648, "loss": 4.11875581741333, "lr": 0.0002, "elapsed_sec": 38547.22432112694, "step_time_sec": 8.230832906992873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4649, "loss": 4.381351470947266, "lr": 0.0002, "elapsed_sec": 38555.45612311363, "step_time_sec": 8.231580633990234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4650, "loss": 4.248639106750488, "lr": 0.0002, "elapsed_sec": 38563.68379378319, "step_time_sec": 8.227560419007204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4651, "loss": 4.195187568664551, "lr": 0.0002, "elapsed_sec": 38571.915170907974, "step_time_sec": 8.231196134001948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4652, "loss": 4.244329452514648, "lr": 0.0002, "elapsed_sec": 38580.14379477501, "step_time_sec": 8.228448184003355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4653, "loss": 4.445136547088623, "lr": 0.0002, "elapsed_sec": 38588.37367892265, "step_time_sec": 8.229752640996594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4654, "loss": 4.1769537925720215, "lr": 0.0002, "elapsed_sec": 38596.603900909424, "step_time_sec": 8.230086055002175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4655, "loss": 4.178707599639893, "lr": 0.0002, "elapsed_sec": 38604.83424830437, "step_time_sec": 8.230167398985941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4656, "loss": 4.365199089050293, "lr": 0.0002, "elapsed_sec": 38613.06471800804, "step_time_sec": 8.230295123998076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4657, "loss": 4.185773849487305, "lr": 0.0002, "elapsed_sec": 38621.2939927578, "step_time_sec": 8.229123281984357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4658, "loss": 4.026432514190674, "lr": 0.0002, "elapsed_sec": 38629.52475953102, "step_time_sec": 8.230622686009156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4659, "loss": 4.013809680938721, "lr": 0.0002, "elapsed_sec": 38637.7573120594, "step_time_sec": 8.232437634986127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4660, "loss": 4.3220696449279785, "lr": 0.0002, "elapsed_sec": 38645.988776922226, "step_time_sec": 8.231261534994701, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4661, "loss": 4.1722731590271, "lr": 0.0002, "elapsed_sec": 38654.21788477898, "step_time_sec": 8.228937305015279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4662, "loss": 4.300130367279053, "lr": 0.0002, "elapsed_sec": 38662.44713973999, "step_time_sec": 8.229145256977063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4663, "loss": 4.258871078491211, "lr": 0.0002, "elapsed_sec": 38670.6781578064, "step_time_sec": 8.230883705982706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4664, "loss": 4.000290870666504, "lr": 0.0002, "elapsed_sec": 38678.90945792198, "step_time_sec": 8.231091852998361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4665, "loss": 4.314413070678711, "lr": 0.0002, "elapsed_sec": 38687.14068841934, "step_time_sec": 8.23115225500078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4666, "loss": 4.108294486999512, "lr": 0.0002, "elapsed_sec": 38695.370309114456, "step_time_sec": 8.229454408021411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4667, "loss": 4.269329071044922, "lr": 0.0002, "elapsed_sec": 38703.59868454933, "step_time_sec": 8.228147557005286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4668, "loss": 4.138242244720459, "lr": 0.0002, "elapsed_sec": 38711.82988214493, "step_time_sec": 8.231045936001465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4669, "loss": 4.179283142089844, "lr": 0.0002, "elapsed_sec": 38720.060537815094, "step_time_sec": 8.230491898983018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4670, "loss": 4.382218360900879, "lr": 0.0002, "elapsed_sec": 38728.29174232483, "step_time_sec": 8.231048704998102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4671, "loss": 4.121603965759277, "lr": 0.0002, "elapsed_sec": 38736.522443532944, "step_time_sec": 8.230553917994257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4672, "loss": 4.108399391174316, "lr": 0.0002, "elapsed_sec": 38744.75334382057, "step_time_sec": 8.230790599976899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4673, "loss": 4.3248090744018555, "lr": 0.0002, "elapsed_sec": 38752.985435009, "step_time_sec": 8.23193332698429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4674, "loss": 4.230209827423096, "lr": 0.0002, "elapsed_sec": 38761.21580171585, "step_time_sec": 8.23018697599764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4675, "loss": 4.212937355041504, "lr": 0.0002, "elapsed_sec": 38769.44656038284, "step_time_sec": 8.23058214300545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4676, "loss": 4.180062294006348, "lr": 0.0002, "elapsed_sec": 38777.677101135254, "step_time_sec": 8.230376090010395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4677, "loss": 4.223797798156738, "lr": 0.0002, "elapsed_sec": 38785.90702366829, "step_time_sec": 8.229842533008195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4678, "loss": 4.1889967918396, "lr": 0.0002, "elapsed_sec": 38794.13825440407, "step_time_sec": 8.231035205011722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4679, "loss": 4.183165550231934, "lr": 0.0002, "elapsed_sec": 38802.368273973465, "step_time_sec": 8.229862287000287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4680, "loss": 4.329306602478027, "lr": 0.0002, "elapsed_sec": 38810.60005450249, "step_time_sec": 8.231574986013584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4681, "loss": 4.081127643585205, "lr": 0.0002, "elapsed_sec": 38818.83117437363, "step_time_sec": 8.230982662003953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4682, "loss": 4.223178863525391, "lr": 0.0002, "elapsed_sec": 38827.062334775925, "step_time_sec": 8.231007619993761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4683, "loss": 4.299455642700195, "lr": 0.0002, "elapsed_sec": 38835.29406809807, "step_time_sec": 8.231610228016507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4684, "loss": 4.052258491516113, "lr": 0.0002, "elapsed_sec": 38843.525656461716, "step_time_sec": 8.231381510995561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4685, "loss": 4.146275043487549, "lr": 0.0002, "elapsed_sec": 38851.75619029999, "step_time_sec": 8.230382470006589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4686, "loss": 4.159904479980469, "lr": 0.0002, "elapsed_sec": 38859.9876973629, "step_time_sec": 8.231383767008083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4687, "loss": 4.198921203613281, "lr": 0.0002, "elapsed_sec": 38868.22051143646, "step_time_sec": 8.232705733011244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4688, "loss": 4.1642746925354, "lr": 0.0002, "elapsed_sec": 38876.451587677, "step_time_sec": 8.230831196997315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4689, "loss": 4.131927967071533, "lr": 0.0002, "elapsed_sec": 38884.68256807327, "step_time_sec": 8.230840958975023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4690, "loss": 4.100308418273926, "lr": 0.0002, "elapsed_sec": 38892.911714315414, "step_time_sec": 8.229042597988155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4691, "loss": 4.163623809814453, "lr": 0.0002, "elapsed_sec": 38901.14169216156, "step_time_sec": 8.229823592992034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4692, "loss": 4.153451919555664, "lr": 0.0002, "elapsed_sec": 38909.37262248993, "step_time_sec": 8.230701101012528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4693, "loss": 4.1646318435668945, "lr": 0.0002, "elapsed_sec": 38917.603944301605, "step_time_sec": 8.231248649011832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4694, "loss": 4.140693187713623, "lr": 0.0002, "elapsed_sec": 38925.835273742676, "step_time_sec": 8.231081058009295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4695, "loss": 4.18088960647583, "lr": 0.0002, "elapsed_sec": 38934.063653469086, "step_time_sec": 8.228240819007624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4696, "loss": 4.140697956085205, "lr": 0.0002, "elapsed_sec": 38942.293984889984, "step_time_sec": 8.230176571989432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4697, "loss": 4.21890115737915, "lr": 0.0002, "elapsed_sec": 38950.52501177788, "step_time_sec": 8.230841348995455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4698, "loss": 4.169214248657227, "lr": 0.0002, "elapsed_sec": 38958.75676012039, "step_time_sec": 8.231585800007451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4699, "loss": 4.197021007537842, "lr": 0.0002, "elapsed_sec": 38966.98806643486, "step_time_sec": 8.231149669009028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4700, "loss": 4.213146209716797, "lr": 0.0002, "elapsed_sec": 38975.219495773315, "step_time_sec": 8.231282511987956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4701, "loss": 4.216726303100586, "lr": 0.0002, "elapsed_sec": 38983.44850611687, "step_time_sec": 8.228834930021549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4702, "loss": 4.118172645568848, "lr": 0.0002, "elapsed_sec": 38991.67638206482, "step_time_sec": 8.227726313023595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4703, "loss": 4.278123378753662, "lr": 0.0002, "elapsed_sec": 38999.90712714195, "step_time_sec": 8.230577542009996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4704, "loss": 4.250976085662842, "lr": 0.0002, "elapsed_sec": 39008.137776851654, "step_time_sec": 8.230522805009969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4705, "loss": 4.255608558654785, "lr": 0.0002, "elapsed_sec": 39016.368288993835, "step_time_sec": 8.230326049990254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4706, "loss": 4.085888862609863, "lr": 0.0002, "elapsed_sec": 39024.59892034531, "step_time_sec": 8.230448888993124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4707, "loss": 4.188760280609131, "lr": 0.0002, "elapsed_sec": 39032.82998585701, "step_time_sec": 8.230903054005466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4708, "loss": 3.9792869091033936, "lr": 0.0002, "elapsed_sec": 39041.059888362885, "step_time_sec": 8.229753819992766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4709, "loss": 4.2230401039123535, "lr": 0.0002, "elapsed_sec": 39049.28933477402, "step_time_sec": 8.229273417993682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4710, "loss": 4.1382551193237305, "lr": 0.0002, "elapsed_sec": 39057.520582437515, "step_time_sec": 8.231099536002148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4711, "loss": 4.208985805511475, "lr": 0.0002, "elapsed_sec": 39065.74993920326, "step_time_sec": 8.229227894014912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4712, "loss": 4.202207088470459, "lr": 0.0002, "elapsed_sec": 39073.97705245018, "step_time_sec": 8.226946712995414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4713, "loss": 4.236143112182617, "lr": 0.0002, "elapsed_sec": 39082.207780599594, "step_time_sec": 8.230507961998228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4714, "loss": 4.2035231590271, "lr": 0.0002, "elapsed_sec": 39090.43702292442, "step_time_sec": 8.229080486024031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4715, "loss": 4.189476013183594, "lr": 0.0002, "elapsed_sec": 39098.66808605194, "step_time_sec": 8.230916849017376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4716, "loss": 4.164924144744873, "lr": 0.0002, "elapsed_sec": 39106.89657306671, "step_time_sec": 8.228341934998753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4717, "loss": 4.067230701446533, "lr": 0.0002, "elapsed_sec": 39115.12621617317, "step_time_sec": 8.229463589028455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4718, "loss": 4.231043338775635, "lr": 0.0002, "elapsed_sec": 39123.35673332214, "step_time_sec": 8.230393456004094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4719, "loss": 4.109531402587891, "lr": 0.0002, "elapsed_sec": 39131.588200330734, "step_time_sec": 8.231294212018838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4720, "loss": 4.129161357879639, "lr": 0.0002, "elapsed_sec": 39139.8166809082, "step_time_sec": 8.228288102982333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4721, "loss": 4.128143310546875, "lr": 0.0002, "elapsed_sec": 39148.0458714962, "step_time_sec": 8.228997997997794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4722, "loss": 4.281868934631348, "lr": 0.0002, "elapsed_sec": 39156.27573180199, "step_time_sec": 8.229704374010907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4723, "loss": 4.258768081665039, "lr": 0.0002, "elapsed_sec": 39164.506549835205, "step_time_sec": 8.230672868026886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4724, "loss": 4.054107189178467, "lr": 0.0002, "elapsed_sec": 39172.73951101303, "step_time_sec": 8.23279929900309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4725, "loss": 4.211211204528809, "lr": 0.0002, "elapsed_sec": 39180.97090053558, "step_time_sec": 8.231223479000619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4726, "loss": 4.222815036773682, "lr": 0.0002, "elapsed_sec": 39189.20139694214, "step_time_sec": 8.230315958004212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4727, "loss": 4.204822063446045, "lr": 0.0002, "elapsed_sec": 39197.43235564232, "step_time_sec": 8.230810317996657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4728, "loss": 4.3526291847229, "lr": 0.0002, "elapsed_sec": 39205.66316461563, "step_time_sec": 8.230597946996568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4729, "loss": 4.20927619934082, "lr": 0.0002, "elapsed_sec": 39213.894282341, "step_time_sec": 8.230971215991303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4730, "loss": 4.14154052734375, "lr": 0.0002, "elapsed_sec": 39222.12534379959, "step_time_sec": 8.230932759004645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4731, "loss": 4.205669403076172, "lr": 0.0002, "elapsed_sec": 39230.356805086136, "step_time_sec": 8.231243643007474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4732, "loss": 4.127157211303711, "lr": 0.0002, "elapsed_sec": 39238.58805823326, "step_time_sec": 8.23116012502578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4733, "loss": 4.17045783996582, "lr": 0.0002, "elapsed_sec": 39246.8183195591, "step_time_sec": 8.230020652001258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4734, "loss": 4.3852858543396, "lr": 0.0002, "elapsed_sec": 39255.04883098602, "step_time_sec": 8.230439232022036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4735, "loss": 4.139902114868164, "lr": 0.0002, "elapsed_sec": 39263.27737355232, "step_time_sec": 8.228356781997718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4736, "loss": 4.024810314178467, "lr": 0.0002, "elapsed_sec": 39271.506027936935, "step_time_sec": 8.228453413001262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4737, "loss": 4.248948574066162, "lr": 0.0002, "elapsed_sec": 39279.73552942276, "step_time_sec": 8.229333797004074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4738, "loss": 4.14621114730835, "lr": 0.0002, "elapsed_sec": 39287.96280646324, "step_time_sec": 8.227124372991966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4739, "loss": 4.043681621551514, "lr": 0.0002, "elapsed_sec": 39296.19426393509, "step_time_sec": 8.231262412999058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4740, "loss": 4.123745441436768, "lr": 0.0002, "elapsed_sec": 39304.42541503906, "step_time_sec": 8.23098775401013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4741, "loss": 4.1306986808776855, "lr": 0.0002, "elapsed_sec": 39312.65504360199, "step_time_sec": 8.229494491999503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4742, "loss": 4.264501571655273, "lr": 0.0002, "elapsed_sec": 39320.8850774765, "step_time_sec": 8.229804532020353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4743, "loss": 4.18209171295166, "lr": 0.0002, "elapsed_sec": 39329.11475586891, "step_time_sec": 8.229531166987726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4744, "loss": 4.189463138580322, "lr": 0.0002, "elapsed_sec": 39337.34534788132, "step_time_sec": 8.230492797010811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4745, "loss": 4.289370536804199, "lr": 0.0002, "elapsed_sec": 39345.57499575615, "step_time_sec": 8.229455577005865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4746, "loss": 4.001054763793945, "lr": 0.0002, "elapsed_sec": 39353.80661892891, "step_time_sec": 8.231452295003692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4747, "loss": 4.178142547607422, "lr": 0.0002, "elapsed_sec": 39362.03600811958, "step_time_sec": 8.229182293987833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4748, "loss": 4.113479137420654, "lr": 0.0002, "elapsed_sec": 39370.26409649849, "step_time_sec": 8.227918489981676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4749, "loss": 4.171435356140137, "lr": 0.0002, "elapsed_sec": 39378.49563097954, "step_time_sec": 8.231423459015787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4750, "loss": 4.304432392120361, "lr": 0.0002, "elapsed_sec": 39386.72635483742, "step_time_sec": 8.230515390983783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4751, "loss": 4.123574733734131, "lr": 0.0002, "elapsed_sec": 39394.95799899101, "step_time_sec": 8.231535631988663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4752, "loss": 3.9227771759033203, "lr": 0.0002, "elapsed_sec": 39403.18738126755, "step_time_sec": 8.229180306982016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4753, "loss": 4.198126792907715, "lr": 0.0002, "elapsed_sec": 39411.41804409027, "step_time_sec": 8.230512900016038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4754, "loss": 4.047615051269531, "lr": 0.0002, "elapsed_sec": 39419.649896383286, "step_time_sec": 8.231718455994269, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4755, "loss": 4.2636637687683105, "lr": 0.0002, "elapsed_sec": 39427.88052845001, "step_time_sec": 8.230466656008502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4756, "loss": 4.173144340515137, "lr": 0.0002, "elapsed_sec": 39436.11193108559, "step_time_sec": 8.23123920999933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4757, "loss": 4.208985805511475, "lr": 0.0002, "elapsed_sec": 39444.341680288315, "step_time_sec": 8.229581905994564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4758, "loss": 4.149138927459717, "lr": 0.0002, "elapsed_sec": 39452.57184958458, "step_time_sec": 8.230037041008472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4759, "loss": 3.968796730041504, "lr": 0.0002, "elapsed_sec": 39460.80256009102, "step_time_sec": 8.230506185995182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4760, "loss": 4.15922737121582, "lr": 0.0002, "elapsed_sec": 39469.03299164772, "step_time_sec": 8.230274061003001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4761, "loss": 4.15969705581665, "lr": 0.0002, "elapsed_sec": 39477.26268386841, "step_time_sec": 8.229565007000929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4762, "loss": 4.236702919006348, "lr": 0.0002, "elapsed_sec": 39485.49359869957, "step_time_sec": 8.23074614998768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4763, "loss": 4.117542743682861, "lr": 0.0002, "elapsed_sec": 39493.72491312027, "step_time_sec": 8.23120951099554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4764, "loss": 4.265200138092041, "lr": 0.0002, "elapsed_sec": 39501.95579624176, "step_time_sec": 8.230638036009623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4765, "loss": 4.270793914794922, "lr": 0.0002, "elapsed_sec": 39510.18595743179, "step_time_sec": 8.230052711005555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4766, "loss": 4.249652862548828, "lr": 0.0002, "elapsed_sec": 39518.41473841667, "step_time_sec": 8.228571551007917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4767, "loss": 4.2314019203186035, "lr": 0.0002, "elapsed_sec": 39526.64396286011, "step_time_sec": 8.229078047006624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4768, "loss": 4.092706203460693, "lr": 0.0002, "elapsed_sec": 39534.874960422516, "step_time_sec": 8.230798844975652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4769, "loss": 4.1728925704956055, "lr": 0.0002, "elapsed_sec": 39543.10405039787, "step_time_sec": 8.22897795701283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4770, "loss": 4.055973529815674, "lr": 0.0002, "elapsed_sec": 39551.33276081085, "step_time_sec": 8.228475848009111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4771, "loss": 4.019777774810791, "lr": 0.0002, "elapsed_sec": 39559.56220841408, "step_time_sec": 8.229349606001051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4772, "loss": 4.269648551940918, "lr": 0.0002, "elapsed_sec": 39567.79266023636, "step_time_sec": 8.230309887992917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4773, "loss": 4.32487678527832, "lr": 0.0002, "elapsed_sec": 39576.023447752, "step_time_sec": 8.230574457003968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4774, "loss": 4.057849407196045, "lr": 0.0002, "elapsed_sec": 39584.25477004051, "step_time_sec": 8.231151368992869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4775, "loss": 4.087983131408691, "lr": 0.0002, "elapsed_sec": 39592.48581671715, "step_time_sec": 8.230874085013056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4776, "loss": 4.229857444763184, "lr": 0.0002, "elapsed_sec": 39600.71643328667, "step_time_sec": 8.230454348988133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4777, "loss": 4.1798810958862305, "lr": 0.0002, "elapsed_sec": 39608.94743132591, "step_time_sec": 8.23086110499571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4778, "loss": 4.144527912139893, "lr": 0.0002, "elapsed_sec": 39617.17702412605, "step_time_sec": 8.229399400006514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4779, "loss": 4.213230133056641, "lr": 0.0002, "elapsed_sec": 39625.40751051903, "step_time_sec": 8.230305976991076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4780, "loss": 4.191300392150879, "lr": 0.0002, "elapsed_sec": 39633.63856482506, "step_time_sec": 8.230923327995697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4781, "loss": 4.20861291885376, "lr": 0.0002, "elapsed_sec": 39641.86965632439, "step_time_sec": 8.230943505011965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4782, "loss": 4.055591583251953, "lr": 0.0002, "elapsed_sec": 39650.10173845291, "step_time_sec": 8.231894873984857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4783, "loss": 4.212401390075684, "lr": 0.0002, "elapsed_sec": 39658.33000063896, "step_time_sec": 8.22816821798915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4784, "loss": 4.1790361404418945, "lr": 0.0002, "elapsed_sec": 39666.559871435165, "step_time_sec": 8.22964421400684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4785, "loss": 4.158243656158447, "lr": 0.0002, "elapsed_sec": 39674.79064822197, "step_time_sec": 8.230612584011396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4786, "loss": 4.145784378051758, "lr": 0.0002, "elapsed_sec": 39683.02169108391, "step_time_sec": 8.230925979994936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4787, "loss": 4.0442280769348145, "lr": 0.0002, "elapsed_sec": 39691.2530105114, "step_time_sec": 8.23116645301343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4788, "loss": 4.049762725830078, "lr": 0.0002, "elapsed_sec": 39699.48233819008, "step_time_sec": 8.229148439015262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4789, "loss": 4.04815673828125, "lr": 0.0002, "elapsed_sec": 39707.71136021614, "step_time_sec": 8.228877456014743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4790, "loss": 4.180858135223389, "lr": 0.0002, "elapsed_sec": 39715.942517757416, "step_time_sec": 8.230979512998601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4791, "loss": 3.944145917892456, "lr": 0.0002, "elapsed_sec": 39724.17394924164, "step_time_sec": 8.231331297021825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4792, "loss": 4.023338317871094, "lr": 0.0002, "elapsed_sec": 39732.405155181885, "step_time_sec": 8.231000201019924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4793, "loss": 4.186649799346924, "lr": 0.0002, "elapsed_sec": 39740.63603305817, "step_time_sec": 8.230700410000281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4794, "loss": 4.281209468841553, "lr": 0.0002, "elapsed_sec": 39748.867204904556, "step_time_sec": 8.23105091098114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4795, "loss": 4.09832763671875, "lr": 0.0002, "elapsed_sec": 39757.09867095947, "step_time_sec": 8.231244165013777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4796, "loss": 4.228309631347656, "lr": 0.0002, "elapsed_sec": 39765.33053588867, "step_time_sec": 8.231728508981178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4797, "loss": 4.178319454193115, "lr": 0.0002, "elapsed_sec": 39773.55998516083, "step_time_sec": 8.229314915981377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4798, "loss": 4.218904972076416, "lr": 0.0002, "elapsed_sec": 39781.79154586792, "step_time_sec": 8.231328124995343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4799, "loss": 4.129065990447998, "lr": 0.0002, "elapsed_sec": 39790.02252984047, "step_time_sec": 8.230854959023418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4800, "loss": 4.1348676681518555, "lr": 0.0002, "elapsed_sec": 39798.254039764404, "step_time_sec": 8.23135257299873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4801, "loss": 4.1828107833862305, "lr": 0.0002, "elapsed_sec": 39806.486339092255, "step_time_sec": 8.232072215003427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4802, "loss": 4.110348224639893, "lr": 0.0002, "elapsed_sec": 39814.71741247177, "step_time_sec": 8.230912610015366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4803, "loss": 4.108824729919434, "lr": 0.0002, "elapsed_sec": 39822.94830060005, "step_time_sec": 8.230742167012068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4804, "loss": 4.203644275665283, "lr": 0.0002, "elapsed_sec": 39831.18053984642, "step_time_sec": 8.232109559001401, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4805, "loss": 4.222957134246826, "lr": 0.0002, "elapsed_sec": 39839.4114382267, "step_time_sec": 8.230713366006967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4806, "loss": 4.241837024688721, "lr": 0.0002, "elapsed_sec": 39847.63872075081, "step_time_sec": 8.227185299998382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4807, "loss": 4.147539138793945, "lr": 0.0002, "elapsed_sec": 39855.86905550957, "step_time_sec": 8.230116734979674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4808, "loss": 4.082638740539551, "lr": 0.0002, "elapsed_sec": 39864.098467588425, "step_time_sec": 8.229306159977568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4809, "loss": 4.094160556793213, "lr": 0.0002, "elapsed_sec": 39872.32944083214, "step_time_sec": 8.23076005699113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4810, "loss": 4.262234687805176, "lr": 0.0002, "elapsed_sec": 39880.56107211113, "step_time_sec": 8.231475527980365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4811, "loss": 4.105875015258789, "lr": 0.0002, "elapsed_sec": 39888.79228210449, "step_time_sec": 8.231055311014643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4812, "loss": 4.140719413757324, "lr": 0.0002, "elapsed_sec": 39897.02261996269, "step_time_sec": 8.23016756598372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4813, "loss": 4.315629005432129, "lr": 0.0002, "elapsed_sec": 39905.2526743412, "step_time_sec": 8.229921352991369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4814, "loss": 4.094555854797363, "lr": 0.0002, "elapsed_sec": 39913.48333621025, "step_time_sec": 8.230495732015697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4815, "loss": 4.20700740814209, "lr": 0.0002, "elapsed_sec": 39921.71378970146, "step_time_sec": 8.230276817979757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4816, "loss": 4.223714351654053, "lr": 0.0002, "elapsed_sec": 39929.94322490692, "step_time_sec": 8.229314775002422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4817, "loss": 4.136070728302002, "lr": 0.0002, "elapsed_sec": 39938.17420768738, "step_time_sec": 8.2308511690062, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4818, "loss": 4.101373195648193, "lr": 0.0002, "elapsed_sec": 39946.40526390076, "step_time_sec": 8.2309059209947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4819, "loss": 4.072392463684082, "lr": 0.0002, "elapsed_sec": 39954.637650966644, "step_time_sec": 8.232194031996187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4820, "loss": 4.3271284103393555, "lr": 0.0002, "elapsed_sec": 39962.866431474686, "step_time_sec": 8.228610071004368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4821, "loss": 3.880185842514038, "lr": 0.0002, "elapsed_sec": 39971.09584403038, "step_time_sec": 8.229296808014624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4822, "loss": 4.278838634490967, "lr": 0.0002, "elapsed_sec": 39979.32746529579, "step_time_sec": 8.231482108996715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4823, "loss": 4.053826332092285, "lr": 0.0002, "elapsed_sec": 39987.55756521225, "step_time_sec": 8.2299029820133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4824, "loss": 4.232839107513428, "lr": 0.0002, "elapsed_sec": 39995.78919291496, "step_time_sec": 8.231478984002024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4825, "loss": 4.006518840789795, "lr": 0.0002, "elapsed_sec": 40004.01831936836, "step_time_sec": 8.229001253988827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4826, "loss": 4.1856303215026855, "lr": 0.0002, "elapsed_sec": 40012.24788022041, "step_time_sec": 8.229403188975994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4827, "loss": 4.208083629608154, "lr": 0.0002, "elapsed_sec": 40020.476637125015, "step_time_sec": 8.228594593994785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4828, "loss": 4.073457717895508, "lr": 0.0002, "elapsed_sec": 40028.70398974419, "step_time_sec": 8.227167122997344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4829, "loss": 4.157803058624268, "lr": 0.0002, "elapsed_sec": 40036.93411016464, "step_time_sec": 8.22997609700542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4830, "loss": 4.1789445877075195, "lr": 0.0002, "elapsed_sec": 40045.163096904755, "step_time_sec": 8.228804762009531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4831, "loss": 4.182295799255371, "lr": 0.0002, "elapsed_sec": 40053.39399433136, "step_time_sec": 8.230762519000564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4832, "loss": 3.932948589324951, "lr": 0.0002, "elapsed_sec": 40061.62456655502, "step_time_sec": 8.230355006002355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4833, "loss": 4.189553260803223, "lr": 0.0002, "elapsed_sec": 40069.85629630089, "step_time_sec": 8.231624646985438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4834, "loss": 4.195915699005127, "lr": 0.0002, "elapsed_sec": 40078.08660507202, "step_time_sec": 8.230128786002751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4835, "loss": 4.070405960083008, "lr": 0.0002, "elapsed_sec": 40086.31568455696, "step_time_sec": 8.228926227020565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4836, "loss": 4.171840667724609, "lr": 0.0002, "elapsed_sec": 40094.54530596733, "step_time_sec": 8.229527347022668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4837, "loss": 4.02459192276001, "lr": 0.0002, "elapsed_sec": 40102.773285865784, "step_time_sec": 8.227749357989524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4838, "loss": 4.024844169616699, "lr": 0.0002, "elapsed_sec": 40111.002990722656, "step_time_sec": 8.229611822986044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4839, "loss": 4.192543029785156, "lr": 0.0002, "elapsed_sec": 40119.23257803917, "step_time_sec": 8.22938211102155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4840, "loss": 4.235642910003662, "lr": 0.0002, "elapsed_sec": 40127.46204185486, "step_time_sec": 8.22930493002059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4841, "loss": 4.216531276702881, "lr": 0.0002, "elapsed_sec": 40135.691331624985, "step_time_sec": 8.229125674988609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4842, "loss": 4.120482444763184, "lr": 0.0002, "elapsed_sec": 40143.92214870453, "step_time_sec": 8.230643390998011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4843, "loss": 4.1429595947265625, "lr": 0.0002, "elapsed_sec": 40152.15386748314, "step_time_sec": 8.231608589994721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4844, "loss": 4.135645866394043, "lr": 0.0002, "elapsed_sec": 40160.385219335556, "step_time_sec": 8.23123676297837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4845, "loss": 4.193605422973633, "lr": 0.0002, "elapsed_sec": 40168.614619255066, "step_time_sec": 8.229223825997906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4846, "loss": 4.112692832946777, "lr": 0.0002, "elapsed_sec": 40176.843418598175, "step_time_sec": 8.228597123990767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4847, "loss": 4.100427627563477, "lr": 0.0002, "elapsed_sec": 40185.07467365265, "step_time_sec": 8.231105562997982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4848, "loss": 4.065142631530762, "lr": 0.0002, "elapsed_sec": 40193.30704665184, "step_time_sec": 8.232267198996851, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4849, "loss": 4.084775447845459, "lr": 0.0002, "elapsed_sec": 40201.538001298904, "step_time_sec": 8.230750287999399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4850, "loss": 4.187477111816406, "lr": 0.0002, "elapsed_sec": 40209.76568365097, "step_time_sec": 8.227575580007397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4851, "loss": 4.2825727462768555, "lr": 0.0002, "elapsed_sec": 40217.99641227722, "step_time_sec": 8.230503522994695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4852, "loss": 4.093852519989014, "lr": 0.0002, "elapsed_sec": 40226.224831581116, "step_time_sec": 8.228287023986923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4853, "loss": 4.137714385986328, "lr": 0.0002, "elapsed_sec": 40234.45538520813, "step_time_sec": 8.230441660998622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4854, "loss": 4.191329002380371, "lr": 0.0002, "elapsed_sec": 40242.683448553085, "step_time_sec": 8.22787115999381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4855, "loss": 4.0210394859313965, "lr": 0.0002, "elapsed_sec": 40250.91228556633, "step_time_sec": 8.22867555700941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4856, "loss": 4.214766979217529, "lr": 0.0002, "elapsed_sec": 40259.14041376114, "step_time_sec": 8.227975629997673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4857, "loss": 4.018596649169922, "lr": 0.0002, "elapsed_sec": 40267.36789774895, "step_time_sec": 8.227340140001616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4858, "loss": 3.921992778778076, "lr": 0.0002, "elapsed_sec": 40275.59953069687, "step_time_sec": 8.231522121001035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4859, "loss": 4.035555839538574, "lr": 0.0002, "elapsed_sec": 40283.83011317253, "step_time_sec": 8.230361133988481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4860, "loss": 4.122793197631836, "lr": 0.0002, "elapsed_sec": 40292.06113290787, "step_time_sec": 8.23085146601079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4861, "loss": 4.209839820861816, "lr": 0.0002, "elapsed_sec": 40300.2928276062, "step_time_sec": 8.231549224990886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4862, "loss": 4.04495096206665, "lr": 0.0002, "elapsed_sec": 40308.52401685715, "step_time_sec": 8.231088170025032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4863, "loss": 4.028587818145752, "lr": 0.0002, "elapsed_sec": 40316.75481200218, "step_time_sec": 8.230609846010339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4864, "loss": 4.020422458648682, "lr": 0.0002, "elapsed_sec": 40324.985023736954, "step_time_sec": 8.230093555001076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4865, "loss": 4.116155624389648, "lr": 0.0002, "elapsed_sec": 40333.21434569359, "step_time_sec": 8.229103621008107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4866, "loss": 4.021243095397949, "lr": 0.0002, "elapsed_sec": 40341.44552350044, "step_time_sec": 8.231023673986783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4867, "loss": 4.159909725189209, "lr": 0.0002, "elapsed_sec": 40349.67614150047, "step_time_sec": 8.230555652990006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4868, "loss": 4.181685924530029, "lr": 0.0002, "elapsed_sec": 40357.906863212585, "step_time_sec": 8.230489526002202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4869, "loss": 4.103095054626465, "lr": 0.0002, "elapsed_sec": 40366.13839626312, "step_time_sec": 8.23139372598962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4870, "loss": 4.092365264892578, "lr": 0.0002, "elapsed_sec": 40374.36946606636, "step_time_sec": 8.230906802986283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4871, "loss": 4.0237932205200195, "lr": 0.0002, "elapsed_sec": 40382.598717689514, "step_time_sec": 8.229087555984734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4872, "loss": 4.150581359863281, "lr": 0.0002, "elapsed_sec": 40390.829260349274, "step_time_sec": 8.230420181003865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4873, "loss": 4.064591407775879, "lr": 0.0002, "elapsed_sec": 40399.05942249298, "step_time_sec": 8.229963970021345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4874, "loss": 3.9791641235351562, "lr": 0.0002, "elapsed_sec": 40408.05229949951, "step_time_sec": 8.992724087001989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4875, "loss": 4.226855754852295, "lr": 0.0002, "elapsed_sec": 40416.28268647194, "step_time_sec": 8.230228698026622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4876, "loss": 4.241078853607178, "lr": 0.0002, "elapsed_sec": 40424.512050151825, "step_time_sec": 8.229233758000191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4877, "loss": 4.012004375457764, "lr": 0.0002, "elapsed_sec": 40432.741911649704, "step_time_sec": 8.22970242597512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4878, "loss": 4.145430088043213, "lr": 0.0002, "elapsed_sec": 40440.97218608856, "step_time_sec": 8.230134179990273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4879, "loss": 4.0836591720581055, "lr": 0.0002, "elapsed_sec": 40449.20343065262, "step_time_sec": 8.23104796098778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4880, "loss": 4.167796611785889, "lr": 0.0002, "elapsed_sec": 40457.43345594406, "step_time_sec": 8.229865417000838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4881, "loss": 4.2596588134765625, "lr": 0.0002, "elapsed_sec": 40465.6644411087, "step_time_sec": 8.23085433000233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4882, "loss": 4.062922477722168, "lr": 0.0002, "elapsed_sec": 40473.895221710205, "step_time_sec": 8.230617568013258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4883, "loss": 4.193446159362793, "lr": 0.0002, "elapsed_sec": 40482.12472009659, "step_time_sec": 8.229307706002146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4884, "loss": 4.131062030792236, "lr": 0.0002, "elapsed_sec": 40490.35497570038, "step_time_sec": 8.230079625005601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4885, "loss": 3.9876601696014404, "lr": 0.0002, "elapsed_sec": 40498.58516025543, "step_time_sec": 8.230109361000359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4886, "loss": 4.269044876098633, "lr": 0.0002, "elapsed_sec": 40506.816024541855, "step_time_sec": 8.230621497990796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4887, "loss": 4.154828071594238, "lr": 0.0002, "elapsed_sec": 40515.04699397087, "step_time_sec": 8.230860399024095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4888, "loss": 4.079864978790283, "lr": 0.0002, "elapsed_sec": 40523.274490594864, "step_time_sec": 8.227238344989019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4889, "loss": 3.8119633197784424, "lr": 0.0002, "elapsed_sec": 40531.50351166725, "step_time_sec": 8.228896076005185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4890, "loss": 4.1137614250183105, "lr": 0.0002, "elapsed_sec": 40539.733179330826, "step_time_sec": 8.229561991989613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4891, "loss": 3.9188594818115234, "lr": 0.0002, "elapsed_sec": 40547.96386408806, "step_time_sec": 8.230459127022186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4892, "loss": 3.9949421882629395, "lr": 0.0002, "elapsed_sec": 40556.19515132904, "step_time_sec": 8.231129268999211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4893, "loss": 4.101755142211914, "lr": 0.0002, "elapsed_sec": 40564.424043655396, "step_time_sec": 8.228726811998058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4894, "loss": 4.2687668800354, "lr": 0.0002, "elapsed_sec": 40572.65468645096, "step_time_sec": 8.230476961005479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4895, "loss": 4.252265453338623, "lr": 0.0002, "elapsed_sec": 40580.88640117645, "step_time_sec": 8.231570912001189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4896, "loss": 4.23968505859375, "lr": 0.0002, "elapsed_sec": 40589.116820812225, "step_time_sec": 8.230247460014652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4897, "loss": 4.312224388122559, "lr": 0.0002, "elapsed_sec": 40597.34776544571, "step_time_sec": 8.230858794995584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4898, "loss": 4.083508491516113, "lr": 0.0002, "elapsed_sec": 40605.579337358475, "step_time_sec": 8.231396641000174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4899, "loss": 4.237307548522949, "lr": 0.0002, "elapsed_sec": 40613.80978822708, "step_time_sec": 8.230264933023136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4900, "loss": 4.105461597442627, "lr": 0.0002, "elapsed_sec": 40622.039288282394, "step_time_sec": 8.229324579995591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4901, "loss": 4.126272201538086, "lr": 0.0002, "elapsed_sec": 40630.269439697266, "step_time_sec": 8.230017516994849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4902, "loss": 4.128223419189453, "lr": 0.0002, "elapsed_sec": 40638.50106048584, "step_time_sec": 8.231450113991741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4903, "loss": 4.076272964477539, "lr": 0.0002, "elapsed_sec": 40646.73245573044, "step_time_sec": 8.231222303002141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4904, "loss": 4.027131080627441, "lr": 0.0002, "elapsed_sec": 40654.96374773979, "step_time_sec": 8.231192062987247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4905, "loss": 4.261019706726074, "lr": 0.0002, "elapsed_sec": 40663.192771434784, "step_time_sec": 8.228853212989634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4906, "loss": 4.149847030639648, "lr": 0.0002, "elapsed_sec": 40671.42118763924, "step_time_sec": 8.228210100001888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4907, "loss": 3.9618401527404785, "lr": 0.0002, "elapsed_sec": 40679.65178346634, "step_time_sec": 8.230445219989633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4908, "loss": 4.120598316192627, "lr": 0.0002, "elapsed_sec": 40687.882584095, "step_time_sec": 8.230629981000675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4909, "loss": 4.121438026428223, "lr": 0.0002, "elapsed_sec": 40696.11163020134, "step_time_sec": 8.22895193999284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4910, "loss": 4.129966735839844, "lr": 0.0002, "elapsed_sec": 40704.341098070145, "step_time_sec": 8.229254825011594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4911, "loss": 4.035172462463379, "lr": 0.0002, "elapsed_sec": 40712.568172216415, "step_time_sec": 8.226915075007128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4912, "loss": 4.057765960693359, "lr": 0.0002, "elapsed_sec": 40720.80023980141, "step_time_sec": 8.231964095000876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4913, "loss": 4.088717937469482, "lr": 0.0002, "elapsed_sec": 40729.03149676323, "step_time_sec": 8.231081630976405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4914, "loss": 4.239010334014893, "lr": 0.0002, "elapsed_sec": 40737.262018442154, "step_time_sec": 8.23034072999144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4915, "loss": 4.194765567779541, "lr": 0.0002, "elapsed_sec": 40745.49282455444, "step_time_sec": 8.23062487700372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4916, "loss": 4.038645267486572, "lr": 0.0002, "elapsed_sec": 40753.72438406944, "step_time_sec": 8.231503137008986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4917, "loss": 4.146258354187012, "lr": 0.0002, "elapsed_sec": 40761.95493030548, "step_time_sec": 8.23027839299175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4918, "loss": 4.2110772132873535, "lr": 0.0002, "elapsed_sec": 40770.1855802536, "step_time_sec": 8.230562167009339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4919, "loss": 4.208174705505371, "lr": 0.0002, "elapsed_sec": 40778.4132232666, "step_time_sec": 8.227462919981917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4920, "loss": 4.190239429473877, "lr": 0.0002, "elapsed_sec": 40786.64142847061, "step_time_sec": 8.228070740005933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4921, "loss": 4.358401298522949, "lr": 0.0002, "elapsed_sec": 40794.872816085815, "step_time_sec": 8.231220244022552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4922, "loss": 4.0452561378479, "lr": 0.0002, "elapsed_sec": 40803.102914094925, "step_time_sec": 8.229934158996912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4923, "loss": 4.075738430023193, "lr": 0.0002, "elapsed_sec": 40811.33308887482, "step_time_sec": 8.230024244985543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4924, "loss": 4.171076774597168, "lr": 0.0002, "elapsed_sec": 40819.56084179878, "step_time_sec": 8.227615323994542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4925, "loss": 4.13142204284668, "lr": 0.0002, "elapsed_sec": 40827.78936910629, "step_time_sec": 8.228350810008124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4926, "loss": 4.17083215713501, "lr": 0.0002, "elapsed_sec": 40836.01739859581, "step_time_sec": 8.227874019998126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4927, "loss": 4.129404544830322, "lr": 0.0002, "elapsed_sec": 40844.24869918823, "step_time_sec": 8.231142391014146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4928, "loss": 4.037005424499512, "lr": 0.0002, "elapsed_sec": 40852.48084688187, "step_time_sec": 8.232015877991216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4929, "loss": 4.243237495422363, "lr": 0.0002, "elapsed_sec": 40860.7118742466, "step_time_sec": 8.230882014991948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4930, "loss": 4.022912979125977, "lr": 0.0002, "elapsed_sec": 40868.94140815735, "step_time_sec": 8.229410358006135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4931, "loss": 4.159771919250488, "lr": 0.0002, "elapsed_sec": 40877.17171835899, "step_time_sec": 8.23013086800347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4932, "loss": 4.128360271453857, "lr": 0.0002, "elapsed_sec": 40885.399754047394, "step_time_sec": 8.227921015990432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4933, "loss": 4.262119770050049, "lr": 0.0002, "elapsed_sec": 40893.62829852104, "step_time_sec": 8.228352707985323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4934, "loss": 4.301794052124023, "lr": 0.0002, "elapsed_sec": 40901.85676574707, "step_time_sec": 8.228291643987177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4935, "loss": 4.104297637939453, "lr": 0.0002, "elapsed_sec": 40910.087181568146, "step_time_sec": 8.230262914992636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4936, "loss": 4.245960235595703, "lr": 0.0002, "elapsed_sec": 40918.31826043129, "step_time_sec": 8.231009629002074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4937, "loss": 4.155942440032959, "lr": 0.0002, "elapsed_sec": 40926.54601573944, "step_time_sec": 8.227563894004561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4938, "loss": 4.148094177246094, "lr": 0.0002, "elapsed_sec": 40934.775309085846, "step_time_sec": 8.229096560011385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4939, "loss": 4.267841339111328, "lr": 0.0002, "elapsed_sec": 40943.00357961655, "step_time_sec": 8.228120358020533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4940, "loss": 4.158804893493652, "lr": 0.0002, "elapsed_sec": 40951.23216199875, "step_time_sec": 8.228500055993209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4941, "loss": 4.136814117431641, "lr": 0.0002, "elapsed_sec": 40959.46127271652, "step_time_sec": 8.228874791006092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4942, "loss": 4.1855244636535645, "lr": 0.0002, "elapsed_sec": 40967.691907167435, "step_time_sec": 8.230488496017642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4943, "loss": 4.349798202514648, "lr": 0.0002, "elapsed_sec": 40975.92366075516, "step_time_sec": 8.231599020014983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4944, "loss": 4.380064964294434, "lr": 0.0002, "elapsed_sec": 40984.154900312424, "step_time_sec": 8.231128962012008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4945, "loss": 4.124135494232178, "lr": 0.0002, "elapsed_sec": 40992.38317346573, "step_time_sec": 8.228102369990665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4946, "loss": 4.310083389282227, "lr": 0.0002, "elapsed_sec": 41000.61396098137, "step_time_sec": 8.230668330012122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4947, "loss": 4.178817272186279, "lr": 0.0002, "elapsed_sec": 41008.84491753578, "step_time_sec": 8.230762054998195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4948, "loss": 4.135049819946289, "lr": 0.0002, "elapsed_sec": 41017.075901031494, "step_time_sec": 8.230831604014384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4949, "loss": 4.148942947387695, "lr": 0.0002, "elapsed_sec": 41025.307210206985, "step_time_sec": 8.231143412995152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4950, "loss": 4.186212539672852, "lr": 0.0002, "elapsed_sec": 41033.53890633583, "step_time_sec": 8.23151056500501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4951, "loss": 4.111554145812988, "lr": 0.0002, "elapsed_sec": 41041.77013516426, "step_time_sec": 8.231082723010331, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4952, "loss": 4.004767417907715, "lr": 0.0002, "elapsed_sec": 41050.00068664551, "step_time_sec": 8.23041046698927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4953, "loss": 4.057829856872559, "lr": 0.0002, "elapsed_sec": 41058.23254585266, "step_time_sec": 8.231671191984788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4954, "loss": 4.02223539352417, "lr": 0.0002, "elapsed_sec": 41066.46281385422, "step_time_sec": 8.230119306012057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4955, "loss": 4.107914447784424, "lr": 0.0002, "elapsed_sec": 41074.691078186035, "step_time_sec": 8.228141613013577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4956, "loss": 4.1854777336120605, "lr": 0.0002, "elapsed_sec": 41082.92016458511, "step_time_sec": 8.2289051960106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4957, "loss": 4.446480751037598, "lr": 0.0002, "elapsed_sec": 41091.14897608757, "step_time_sec": 8.228665397007717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4958, "loss": 4.035767078399658, "lr": 0.0002, "elapsed_sec": 41099.38012313843, "step_time_sec": 8.231003054999746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4959, "loss": 4.088088035583496, "lr": 0.0002, "elapsed_sec": 41107.610597372055, "step_time_sec": 8.230314629006898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4960, "loss": 4.38602352142334, "lr": 0.0002, "elapsed_sec": 41115.84034538269, "step_time_sec": 8.22963851498207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4961, "loss": 4.19476318359375, "lr": 0.0002, "elapsed_sec": 41124.06883239746, "step_time_sec": 8.228323484014254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4962, "loss": 4.131997585296631, "lr": 0.0002, "elapsed_sec": 41132.29870414734, "step_time_sec": 8.229689648986096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4963, "loss": 4.043463230133057, "lr": 0.0002, "elapsed_sec": 41140.52974939346, "step_time_sec": 8.23092393798288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4964, "loss": 4.1213812828063965, "lr": 0.0002, "elapsed_sec": 41148.76073551178, "step_time_sec": 8.230797306983732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4965, "loss": 4.191317081451416, "lr": 0.0002, "elapsed_sec": 41156.99151110649, "step_time_sec": 8.2306022499979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4966, "loss": 4.170833587646484, "lr": 0.0002, "elapsed_sec": 41165.221326589584, "step_time_sec": 8.229667120001977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4967, "loss": 4.079400062561035, "lr": 0.0002, "elapsed_sec": 41173.450507164, "step_time_sec": 8.22903347099782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4968, "loss": 3.9807703495025635, "lr": 0.0002, "elapsed_sec": 41181.682144641876, "step_time_sec": 8.231489823985612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4969, "loss": 4.175074577331543, "lr": 0.0002, "elapsed_sec": 41189.91273331642, "step_time_sec": 8.230441905994667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4970, "loss": 4.239867687225342, "lr": 0.0002, "elapsed_sec": 41198.14444422722, "step_time_sec": 8.231525633018464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4971, "loss": 4.116968154907227, "lr": 0.0002, "elapsed_sec": 41206.37297081947, "step_time_sec": 8.228359961998649, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4972, "loss": 4.123140335083008, "lr": 0.0002, "elapsed_sec": 41214.602725982666, "step_time_sec": 8.22963244098355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4973, "loss": 4.142414093017578, "lr": 0.0002, "elapsed_sec": 41222.8316988945, "step_time_sec": 8.228804145997856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4974, "loss": 4.06960916519165, "lr": 0.0002, "elapsed_sec": 41231.06067442894, "step_time_sec": 8.228900415007956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4975, "loss": 4.154255390167236, "lr": 0.0002, "elapsed_sec": 41239.28911304474, "step_time_sec": 8.228246671002125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4976, "loss": 4.108661651611328, "lr": 0.0002, "elapsed_sec": 41247.52020716667, "step_time_sec": 8.23093222398893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4977, "loss": 4.245575428009033, "lr": 0.0002, "elapsed_sec": 41255.751536369324, "step_time_sec": 8.231172965985024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4978, "loss": 3.991666078567505, "lr": 0.0002, "elapsed_sec": 41263.97940015793, "step_time_sec": 8.22765677300049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4979, "loss": 4.174072265625, "lr": 0.0002, "elapsed_sec": 41272.2081694603, "step_time_sec": 8.228635256004054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4980, "loss": 4.145503044128418, "lr": 0.0002, "elapsed_sec": 41280.437495708466, "step_time_sec": 8.229158289002953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4981, "loss": 3.826537609100342, "lr": 0.0002, "elapsed_sec": 41288.72461438179, "step_time_sec": 8.23868027201388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4982, "loss": 4.23162317276001, "lr": 0.0002, "elapsed_sec": 41296.9562330246, "step_time_sec": 8.231485121999867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4983, "loss": 4.188747406005859, "lr": 0.0002, "elapsed_sec": 41305.18527817726, "step_time_sec": 8.228858347982168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4984, "loss": 4.233436107635498, "lr": 0.0002, "elapsed_sec": 41313.414417505264, "step_time_sec": 8.229003537009703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4985, "loss": 4.260443687438965, "lr": 0.0002, "elapsed_sec": 41321.64517688751, "step_time_sec": 8.230579916009447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4986, "loss": 4.018224239349365, "lr": 0.0002, "elapsed_sec": 41329.876858234406, "step_time_sec": 8.231579926010454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4987, "loss": 4.19993257522583, "lr": 0.0002, "elapsed_sec": 41338.10722208023, "step_time_sec": 8.230179134989157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4988, "loss": 4.114248275756836, "lr": 0.0002, "elapsed_sec": 41346.33813238144, "step_time_sec": 8.230731685995124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4989, "loss": 4.078666687011719, "lr": 0.0002, "elapsed_sec": 41354.56881022453, "step_time_sec": 8.230527110979892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4990, "loss": 4.193437576293945, "lr": 0.0002, "elapsed_sec": 41362.796788692474, "step_time_sec": 8.22783594802604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4991, "loss": 4.007796764373779, "lr": 0.0002, "elapsed_sec": 41371.02790808678, "step_time_sec": 8.231024126987904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4992, "loss": 4.004022121429443, "lr": 0.0002, "elapsed_sec": 41379.25920677185, "step_time_sec": 8.23114827601239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4993, "loss": 4.081519603729248, "lr": 0.0002, "elapsed_sec": 41387.49112558365, "step_time_sec": 8.231714686000487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4994, "loss": 4.068916320800781, "lr": 0.0002, "elapsed_sec": 41395.72219371796, "step_time_sec": 8.230909896985395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4995, "loss": 4.193236351013184, "lr": 0.0002, "elapsed_sec": 41403.95398402214, "step_time_sec": 8.231633666990092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4996, "loss": 4.2847185134887695, "lr": 0.0002, "elapsed_sec": 41412.185415029526, "step_time_sec": 8.231271020980785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4997, "loss": 4.179982662200928, "lr": 0.0002, "elapsed_sec": 41420.41682100296, "step_time_sec": 8.231239742017351, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4998, "loss": 4.083010196685791, "lr": 0.0002, "elapsed_sec": 41428.64738917351, "step_time_sec": 8.230431578005664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 4999, "loss": 4.150969982147217, "lr": 0.0002, "elapsed_sec": 41436.87807559967, "step_time_sec": 8.230506111984141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5000, "loss": 4.157702922821045, "lr": 0.0002, "elapsed_sec": 41445.10971069336, "step_time_sec": 52.32811249999213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9972883530135732, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5001, "loss": 4.213318347930908, "lr": 0.0002, "elapsed_sec": 41497.44739174843, "step_time_sec": 8.241022692993283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5002, "loss": 4.120612621307373, "lr": 0.0002, "elapsed_sec": 41505.67849493027, "step_time_sec": 8.230876163986977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5003, "loss": 4.119429588317871, "lr": 0.0002, "elapsed_sec": 41513.9090526104, "step_time_sec": 8.230405301990686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5004, "loss": 4.203478813171387, "lr": 0.0002, "elapsed_sec": 41522.139808893204, "step_time_sec": 8.23062989799655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5005, "loss": 4.169849395751953, "lr": 0.0002, "elapsed_sec": 41530.36986756325, "step_time_sec": 8.229891056020278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5006, "loss": 4.020706653594971, "lr": 0.0002, "elapsed_sec": 41538.60073399544, "step_time_sec": 8.230730006005615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5007, "loss": 4.154813289642334, "lr": 0.0002, "elapsed_sec": 41546.829611063, "step_time_sec": 8.228685194975697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5008, "loss": 4.195794105529785, "lr": 0.0002, "elapsed_sec": 41555.06021857262, "step_time_sec": 8.23052717000246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5009, "loss": 4.140559673309326, "lr": 0.0002, "elapsed_sec": 41563.28957247734, "step_time_sec": 8.229213251004694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5010, "loss": 3.9954617023468018, "lr": 0.0002, "elapsed_sec": 41571.517939567566, "step_time_sec": 8.228184584993869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5011, "loss": 4.215422630310059, "lr": 0.0002, "elapsed_sec": 41579.74695467949, "step_time_sec": 8.228846468991833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5012, "loss": 4.118930816650391, "lr": 0.0002, "elapsed_sec": 41587.97441124916, "step_time_sec": 8.227268059010385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5013, "loss": 4.174018383026123, "lr": 0.0002, "elapsed_sec": 41596.20351743698, "step_time_sec": 8.22901545302011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5014, "loss": 4.106155872344971, "lr": 0.0002, "elapsed_sec": 41604.432181835175, "step_time_sec": 8.228516539995326, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5015, "loss": 4.117334842681885, "lr": 0.0002, "elapsed_sec": 41612.66211295128, "step_time_sec": 8.2298242340039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5016, "loss": 4.07647705078125, "lr": 0.0002, "elapsed_sec": 41620.892876148224, "step_time_sec": 8.230558373994427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5017, "loss": 4.109013557434082, "lr": 0.0002, "elapsed_sec": 41629.122488737106, "step_time_sec": 8.229400291020283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5018, "loss": 4.182013034820557, "lr": 0.0002, "elapsed_sec": 41637.3507604599, "step_time_sec": 8.22814656898845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5019, "loss": 4.1852850914001465, "lr": 0.0002, "elapsed_sec": 41645.58003902435, "step_time_sec": 8.229105708014686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5020, "loss": 4.0728983879089355, "lr": 0.0002, "elapsed_sec": 41653.80806851387, "step_time_sec": 8.227917979005724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5021, "loss": 4.10052490234375, "lr": 0.0002, "elapsed_sec": 41662.037475824356, "step_time_sec": 8.229190172976814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5022, "loss": 4.05230712890625, "lr": 0.0002, "elapsed_sec": 41670.26496577263, "step_time_sec": 8.227316745993448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5023, "loss": 4.27181339263916, "lr": 0.0002, "elapsed_sec": 41678.492483615875, "step_time_sec": 8.227428483980475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5024, "loss": 4.029228687286377, "lr": 0.0002, "elapsed_sec": 41686.72215104103, "step_time_sec": 8.229495080013294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5025, "loss": 4.18944787979126, "lr": 0.0002, "elapsed_sec": 41694.95084309578, "step_time_sec": 8.22851189202629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5026, "loss": 4.084731101989746, "lr": 0.0002, "elapsed_sec": 41703.18078446388, "step_time_sec": 8.22983929599286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5027, "loss": 4.09210205078125, "lr": 0.0002, "elapsed_sec": 41711.41154098511, "step_time_sec": 8.230578540998977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5028, "loss": 4.098411560058594, "lr": 0.0002, "elapsed_sec": 41719.64193439484, "step_time_sec": 8.230262096010847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5029, "loss": 4.2571001052856445, "lr": 0.0002, "elapsed_sec": 41727.87253713608, "step_time_sec": 8.230357318010647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5030, "loss": 4.009481430053711, "lr": 0.0002, "elapsed_sec": 41736.10278344154, "step_time_sec": 8.23010924301343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5031, "loss": 4.131728172302246, "lr": 0.0002, "elapsed_sec": 41744.33304476738, "step_time_sec": 8.230087711999658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5032, "loss": 4.058544635772705, "lr": 0.0002, "elapsed_sec": 41752.56190919876, "step_time_sec": 8.228675238002324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5033, "loss": 4.0866827964782715, "lr": 0.0002, "elapsed_sec": 41760.791497945786, "step_time_sec": 8.229443627002183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5034, "loss": 4.126616954803467, "lr": 0.0002, "elapsed_sec": 41769.0213739872, "step_time_sec": 8.229771273996448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5035, "loss": 4.149439334869385, "lr": 0.0002, "elapsed_sec": 41777.250975608826, "step_time_sec": 8.22940145098255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5036, "loss": 3.948035955429077, "lr": 0.0002, "elapsed_sec": 41785.480172634125, "step_time_sec": 8.229019070975482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5037, "loss": 4.171747207641602, "lr": 0.0002, "elapsed_sec": 41793.710765600204, "step_time_sec": 8.230448140995577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5038, "loss": 4.056668758392334, "lr": 0.0002, "elapsed_sec": 41801.94122362137, "step_time_sec": 8.230305337987375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5039, "loss": 4.144433975219727, "lr": 0.0002, "elapsed_sec": 41810.17211604118, "step_time_sec": 8.23070522697526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5040, "loss": 4.22010612487793, "lr": 0.0002, "elapsed_sec": 41818.40237045288, "step_time_sec": 8.230183375009801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5041, "loss": 4.210634231567383, "lr": 0.0002, "elapsed_sec": 41826.63358783722, "step_time_sec": 8.231032518000575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5042, "loss": 4.110995769500732, "lr": 0.0002, "elapsed_sec": 41834.865005254745, "step_time_sec": 8.231259766005678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5043, "loss": 4.161782264709473, "lr": 0.0002, "elapsed_sec": 41843.094069719315, "step_time_sec": 8.228855049994309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5044, "loss": 4.068431854248047, "lr": 0.0002, "elapsed_sec": 41851.32288527489, "step_time_sec": 8.228700698993634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5045, "loss": 4.148701190948486, "lr": 0.0002, "elapsed_sec": 41859.55144572258, "step_time_sec": 8.228429262002464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5046, "loss": 4.245339393615723, "lr": 0.0002, "elapsed_sec": 41867.780895233154, "step_time_sec": 8.229229238000698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5047, "loss": 4.168095111846924, "lr": 0.0002, "elapsed_sec": 41876.01222419739, "step_time_sec": 8.231191975006368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5048, "loss": 4.282081604003906, "lr": 0.0002, "elapsed_sec": 41884.24337530136, "step_time_sec": 8.230969385011122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5049, "loss": 4.117944240570068, "lr": 0.0002, "elapsed_sec": 41892.47315168381, "step_time_sec": 8.2296449150017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5050, "loss": 4.075160980224609, "lr": 0.0002, "elapsed_sec": 41900.702489852905, "step_time_sec": 8.229170411010273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5051, "loss": 4.313336372375488, "lr": 0.0002, "elapsed_sec": 41908.93335199356, "step_time_sec": 8.230778164986987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5052, "loss": 4.224307060241699, "lr": 0.0002, "elapsed_sec": 41917.16368198395, "step_time_sec": 8.230117439990863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5053, "loss": 4.158719539642334, "lr": 0.0002, "elapsed_sec": 41925.39510345459, "step_time_sec": 8.231265325011918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5054, "loss": 4.1297783851623535, "lr": 0.0002, "elapsed_sec": 41933.62672615051, "step_time_sec": 8.231446329999017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5055, "loss": 4.166304111480713, "lr": 0.0002, "elapsed_sec": 41941.857563495636, "step_time_sec": 8.23072266401141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5056, "loss": 4.135865211486816, "lr": 0.0002, "elapsed_sec": 41950.08714604378, "step_time_sec": 8.229383450001478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5057, "loss": 4.185489654541016, "lr": 0.0002, "elapsed_sec": 41958.31746697426, "step_time_sec": 8.230170392984292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5058, "loss": 4.198424816131592, "lr": 0.0002, "elapsed_sec": 41966.54873871803, "step_time_sec": 8.231176030007191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5059, "loss": 4.15989351272583, "lr": 0.0002, "elapsed_sec": 41974.77940607071, "step_time_sec": 8.230530143977376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5060, "loss": 3.9910409450531006, "lr": 0.0002, "elapsed_sec": 41983.01069736481, "step_time_sec": 8.231080992001807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5061, "loss": 4.074871063232422, "lr": 0.0002, "elapsed_sec": 41991.23998570442, "step_time_sec": 8.22914713897626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5062, "loss": 3.937312364578247, "lr": 0.0002, "elapsed_sec": 41999.46995306015, "step_time_sec": 8.229839782987256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5063, "loss": 4.229654788970947, "lr": 0.0002, "elapsed_sec": 42007.69827413559, "step_time_sec": 8.22817126699374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5064, "loss": 4.187760829925537, "lr": 0.0002, "elapsed_sec": 42015.92748785019, "step_time_sec": 8.229014854005072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5065, "loss": 4.169967174530029, "lr": 0.0002, "elapsed_sec": 42024.155710697174, "step_time_sec": 8.228047297016019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5066, "loss": 4.136377811431885, "lr": 0.0002, "elapsed_sec": 42032.386441230774, "step_time_sec": 8.230634480016306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5067, "loss": 4.047455310821533, "lr": 0.0002, "elapsed_sec": 42040.61741757393, "step_time_sec": 8.23075106999022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5068, "loss": 4.208549976348877, "lr": 0.0002, "elapsed_sec": 42048.84595537186, "step_time_sec": 8.228388877003454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5069, "loss": 4.002881050109863, "lr": 0.0002, "elapsed_sec": 42057.0744304657, "step_time_sec": 8.228371833014535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5070, "loss": 4.15671443939209, "lr": 0.0002, "elapsed_sec": 42065.30137181282, "step_time_sec": 8.226741885009687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5071, "loss": 4.158766746520996, "lr": 0.0002, "elapsed_sec": 42073.53071331978, "step_time_sec": 8.229227130010258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5072, "loss": 4.174067497253418, "lr": 0.0002, "elapsed_sec": 42081.758729696274, "step_time_sec": 8.22784116899129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5073, "loss": 4.148746013641357, "lr": 0.0002, "elapsed_sec": 42089.98886799812, "step_time_sec": 8.229987803992117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5074, "loss": 4.259110450744629, "lr": 0.0002, "elapsed_sec": 42098.21951818466, "step_time_sec": 8.230429186980473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5075, "loss": 4.113971710205078, "lr": 0.0002, "elapsed_sec": 42106.448899269104, "step_time_sec": 8.229247740993742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5076, "loss": 4.166997909545898, "lr": 0.0002, "elapsed_sec": 42114.67777514458, "step_time_sec": 8.22877272000187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5077, "loss": 4.059295177459717, "lr": 0.0002, "elapsed_sec": 42122.907183885574, "step_time_sec": 8.229204216011567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5078, "loss": 4.110495567321777, "lr": 0.0002, "elapsed_sec": 42131.138446092606, "step_time_sec": 8.231168508995324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5079, "loss": 4.08615255355835, "lr": 0.0002, "elapsed_sec": 42139.36888742447, "step_time_sec": 8.230212939000921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5080, "loss": 3.966477155685425, "lr": 0.0002, "elapsed_sec": 42147.59992313385, "step_time_sec": 8.230879811977502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5081, "loss": 4.246054172515869, "lr": 0.0002, "elapsed_sec": 42155.831055402756, "step_time_sec": 8.23098260597908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5082, "loss": 4.1619110107421875, "lr": 0.0002, "elapsed_sec": 42164.062467098236, "step_time_sec": 8.231283872009953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5083, "loss": 4.022072792053223, "lr": 0.0002, "elapsed_sec": 42172.293209552765, "step_time_sec": 8.23058276201482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5084, "loss": 4.1136956214904785, "lr": 0.0002, "elapsed_sec": 42180.52340865135, "step_time_sec": 8.23000220002723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5085, "loss": 4.157798767089844, "lr": 0.0002, "elapsed_sec": 42188.75207424164, "step_time_sec": 8.228489506000187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5086, "loss": 4.083215236663818, "lr": 0.0002, "elapsed_sec": 42196.98256993294, "step_time_sec": 8.230393353005638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5087, "loss": 4.249209403991699, "lr": 0.0002, "elapsed_sec": 42205.213984012604, "step_time_sec": 8.231211398000596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5088, "loss": 4.131107330322266, "lr": 0.0002, "elapsed_sec": 42213.44384217262, "step_time_sec": 8.229723207012285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5089, "loss": 4.027151107788086, "lr": 0.0002, "elapsed_sec": 42221.67515087128, "step_time_sec": 8.231147387996316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5090, "loss": 4.30109167098999, "lr": 0.0002, "elapsed_sec": 42229.908095121384, "step_time_sec": 8.23286795700551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5091, "loss": 4.215742588043213, "lr": 0.0002, "elapsed_sec": 42238.13562178612, "step_time_sec": 8.227300252998248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5092, "loss": 4.172149181365967, "lr": 0.0002, "elapsed_sec": 42246.36369514465, "step_time_sec": 8.227911613008473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5093, "loss": 4.082561016082764, "lr": 0.0002, "elapsed_sec": 42254.59309744835, "step_time_sec": 8.229263479996007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5094, "loss": 4.011248588562012, "lr": 0.0002, "elapsed_sec": 42262.82250523567, "step_time_sec": 8.229240954009583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5095, "loss": 4.199491024017334, "lr": 0.0002, "elapsed_sec": 42271.05213546753, "step_time_sec": 8.229513505997602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5096, "loss": 4.134302616119385, "lr": 0.0002, "elapsed_sec": 42279.28280758858, "step_time_sec": 8.23048581200419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5097, "loss": 4.038638591766357, "lr": 0.0002, "elapsed_sec": 42287.513556957245, "step_time_sec": 8.230597524991026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5098, "loss": 4.02587890625, "lr": 0.0002, "elapsed_sec": 42295.74470257759, "step_time_sec": 8.230985902977409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5099, "loss": 4.096292495727539, "lr": 0.0002, "elapsed_sec": 42303.975712776184, "step_time_sec": 8.230883531010477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5100, "loss": 4.128557205200195, "lr": 0.0002, "elapsed_sec": 42312.206370830536, "step_time_sec": 8.230557164002676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5101, "loss": 4.20503044128418, "lr": 0.0002, "elapsed_sec": 42320.43796849251, "step_time_sec": 8.23142268700758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5102, "loss": 4.169356822967529, "lr": 0.0002, "elapsed_sec": 42328.666771650314, "step_time_sec": 8.228601869021077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5103, "loss": 4.285913944244385, "lr": 0.0002, "elapsed_sec": 42336.896933317184, "step_time_sec": 8.230018358997768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5104, "loss": 4.108333110809326, "lr": 0.0002, "elapsed_sec": 42345.128111600876, "step_time_sec": 8.231065891013714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5105, "loss": 4.017391681671143, "lr": 0.0002, "elapsed_sec": 42353.35949063301, "step_time_sec": 8.231194532010704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5106, "loss": 4.0750837326049805, "lr": 0.0002, "elapsed_sec": 42361.59053850174, "step_time_sec": 8.230872248997912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5107, "loss": 4.186354160308838, "lr": 0.0002, "elapsed_sec": 42369.82028913498, "step_time_sec": 8.229637578013353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5108, "loss": 4.099082946777344, "lr": 0.0002, "elapsed_sec": 42378.04944014549, "step_time_sec": 8.22894783000811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5109, "loss": 4.025119781494141, "lr": 0.0002, "elapsed_sec": 42386.27977895737, "step_time_sec": 8.230213314003777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5110, "loss": 4.113507270812988, "lr": 0.0002, "elapsed_sec": 42394.510691165924, "step_time_sec": 8.230744888976915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5111, "loss": 4.091292858123779, "lr": 0.0002, "elapsed_sec": 42402.74235081673, "step_time_sec": 8.231502567999996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5112, "loss": 4.037569522857666, "lr": 0.0002, "elapsed_sec": 42410.97217822075, "step_time_sec": 8.229690527979983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5113, "loss": 4.158668041229248, "lr": 0.0002, "elapsed_sec": 42419.20165014267, "step_time_sec": 8.229307938017882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5114, "loss": 4.193227767944336, "lr": 0.0002, "elapsed_sec": 42427.43217611313, "step_time_sec": 8.230436310026562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5115, "loss": 4.035955905914307, "lr": 0.0002, "elapsed_sec": 42435.663445711136, "step_time_sec": 8.231055564014241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5116, "loss": 3.991971969604492, "lr": 0.0002, "elapsed_sec": 42443.89470553398, "step_time_sec": 8.23110859200824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5117, "loss": 4.065884590148926, "lr": 0.0002, "elapsed_sec": 42452.12508034706, "step_time_sec": 8.230230947985547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5118, "loss": 4.151365280151367, "lr": 0.0002, "elapsed_sec": 42460.35614490509, "step_time_sec": 8.230952664016513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5119, "loss": 4.181751251220703, "lr": 0.0002, "elapsed_sec": 42468.587282180786, "step_time_sec": 8.230950291996123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5120, "loss": 4.051326751708984, "lr": 0.0002, "elapsed_sec": 42476.81660795212, "step_time_sec": 8.229155823006295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5121, "loss": 4.074948310852051, "lr": 0.0002, "elapsed_sec": 42485.04595351219, "step_time_sec": 8.229261666012462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5122, "loss": 4.10170841217041, "lr": 0.0002, "elapsed_sec": 42493.275643348694, "step_time_sec": 8.229455491993576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5123, "loss": 4.15969181060791, "lr": 0.0002, "elapsed_sec": 42501.50589990616, "step_time_sec": 8.230102568981238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5124, "loss": 3.960071086883545, "lr": 0.0002, "elapsed_sec": 42509.73507523537, "step_time_sec": 8.229053332004696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5125, "loss": 4.073862552642822, "lr": 0.0002, "elapsed_sec": 42517.96303200722, "step_time_sec": 8.22775909100892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5126, "loss": 4.16685676574707, "lr": 0.0002, "elapsed_sec": 42526.19195008278, "step_time_sec": 8.228787623986136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5127, "loss": 4.036871910095215, "lr": 0.0002, "elapsed_sec": 42534.42366147041, "step_time_sec": 8.231547139992472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5128, "loss": 4.072980880737305, "lr": 0.0002, "elapsed_sec": 42542.65380716324, "step_time_sec": 8.23001726300572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5129, "loss": 4.11726188659668, "lr": 0.0002, "elapsed_sec": 42550.88491296768, "step_time_sec": 8.230959020991577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5130, "loss": 4.190931797027588, "lr": 0.0002, "elapsed_sec": 42559.11492085457, "step_time_sec": 8.229806313989684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5131, "loss": 4.193874359130859, "lr": 0.0002, "elapsed_sec": 42567.34332036972, "step_time_sec": 8.2283051230188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5132, "loss": 4.173878192901611, "lr": 0.0002, "elapsed_sec": 42575.57396173477, "step_time_sec": 8.230485850013793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5133, "loss": 4.135875701904297, "lr": 0.0002, "elapsed_sec": 42583.80359959602, "step_time_sec": 8.229430620995117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5134, "loss": 4.267767906188965, "lr": 0.0002, "elapsed_sec": 42592.033301353455, "step_time_sec": 8.229548003000673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5135, "loss": 4.068815231323242, "lr": 0.0002, "elapsed_sec": 42600.26385188103, "step_time_sec": 8.230444763990818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5136, "loss": 4.569265365600586, "lr": 0.0002, "elapsed_sec": 42608.49447655678, "step_time_sec": 8.230445369990775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5137, "loss": 4.096375942230225, "lr": 0.0002, "elapsed_sec": 42616.72386479378, "step_time_sec": 8.229262767999899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5138, "loss": 4.047229290008545, "lr": 0.0002, "elapsed_sec": 42624.952694654465, "step_time_sec": 8.228626900992822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5139, "loss": 4.024569511413574, "lr": 0.0002, "elapsed_sec": 42633.18235087395, "step_time_sec": 8.229486391996033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5140, "loss": 4.0000715255737305, "lr": 0.0002, "elapsed_sec": 42641.41304731369, "step_time_sec": 8.230563943012385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5141, "loss": 4.100747108459473, "lr": 0.0002, "elapsed_sec": 42649.64406681061, "step_time_sec": 8.230872770975111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5142, "loss": 4.189759731292725, "lr": 0.0002, "elapsed_sec": 42657.875349998474, "step_time_sec": 8.231097468989901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5143, "loss": 3.929445743560791, "lr": 0.0002, "elapsed_sec": 42666.10446333885, "step_time_sec": 8.228969759016763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5144, "loss": 4.262655735015869, "lr": 0.0002, "elapsed_sec": 42674.33463358879, "step_time_sec": 8.229998112015892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5145, "loss": 4.0944037437438965, "lr": 0.0002, "elapsed_sec": 42682.56387543678, "step_time_sec": 8.229136426001787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5146, "loss": 4.094953536987305, "lr": 0.0002, "elapsed_sec": 42690.79460287094, "step_time_sec": 8.230534769012593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5147, "loss": 4.0937066078186035, "lr": 0.0002, "elapsed_sec": 42699.02405190468, "step_time_sec": 8.229290483985096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5148, "loss": 4.105748176574707, "lr": 0.0002, "elapsed_sec": 42707.25282359123, "step_time_sec": 8.228600955015281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5149, "loss": 4.099915981292725, "lr": 0.0002, "elapsed_sec": 42715.48245191574, "step_time_sec": 8.229502378002508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5150, "loss": 4.216845512390137, "lr": 0.0002, "elapsed_sec": 42723.711245298386, "step_time_sec": 8.228651093988447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5151, "loss": 4.0758137702941895, "lr": 0.0002, "elapsed_sec": 42731.9408261776, "step_time_sec": 8.229369341017446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5152, "loss": 3.991456985473633, "lr": 0.0002, "elapsed_sec": 42740.16917037964, "step_time_sec": 8.22823997301748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5153, "loss": 4.1983819007873535, "lr": 0.0002, "elapsed_sec": 42748.399985551834, "step_time_sec": 8.2305986660067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5154, "loss": 4.232901573181152, "lr": 0.0002, "elapsed_sec": 42756.631555080414, "step_time_sec": 8.231409790023463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5155, "loss": 4.079742431640625, "lr": 0.0002, "elapsed_sec": 42764.86041331291, "step_time_sec": 8.22868508999818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5156, "loss": 4.075519561767578, "lr": 0.0002, "elapsed_sec": 42773.09032535553, "step_time_sec": 8.229755976994056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5157, "loss": 3.9060795307159424, "lr": 0.0002, "elapsed_sec": 42781.31942462921, "step_time_sec": 8.228946797986282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5158, "loss": 4.238706588745117, "lr": 0.0002, "elapsed_sec": 42789.549136161804, "step_time_sec": 8.229572543990798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5159, "loss": 3.9419350624084473, "lr": 0.0002, "elapsed_sec": 42797.77788281441, "step_time_sec": 8.228613916988252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5160, "loss": 4.362409591674805, "lr": 0.0002, "elapsed_sec": 42806.00729441643, "step_time_sec": 8.22919189397362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5161, "loss": 4.160886764526367, "lr": 0.0002, "elapsed_sec": 42814.23742675781, "step_time_sec": 8.230000675015617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5162, "loss": 4.139545440673828, "lr": 0.0002, "elapsed_sec": 42822.46672105789, "step_time_sec": 8.229100188997108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5163, "loss": 3.8051624298095703, "lr": 0.0002, "elapsed_sec": 42830.698090553284, "step_time_sec": 8.231223556009354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5164, "loss": 4.109070777893066, "lr": 0.0002, "elapsed_sec": 42838.926926612854, "step_time_sec": 8.228669559000991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5165, "loss": 4.021169185638428, "lr": 0.0002, "elapsed_sec": 42847.15712881088, "step_time_sec": 8.230042570998194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5166, "loss": 4.129500865936279, "lr": 0.0002, "elapsed_sec": 42855.388350725174, "step_time_sec": 8.231130870000925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5167, "loss": 4.2164225578308105, "lr": 0.0002, "elapsed_sec": 42863.618242025375, "step_time_sec": 8.22967007299303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5168, "loss": 4.215542316436768, "lr": 0.0002, "elapsed_sec": 42871.848350286484, "step_time_sec": 8.22995109698968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5169, "loss": 3.9418725967407227, "lr": 0.0002, "elapsed_sec": 42880.07952284813, "step_time_sec": 8.230965274007758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5170, "loss": 4.02067232131958, "lr": 0.0002, "elapsed_sec": 42888.30707716942, "step_time_sec": 8.22743959800573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5171, "loss": 4.223878383636475, "lr": 0.0002, "elapsed_sec": 42896.537974357605, "step_time_sec": 8.230648741999175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5172, "loss": 4.14597225189209, "lr": 0.0002, "elapsed_sec": 42904.76818871498, "step_time_sec": 8.230087447998812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5173, "loss": 4.191648483276367, "lr": 0.0002, "elapsed_sec": 42912.99757647514, "step_time_sec": 8.229240579996258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5174, "loss": 4.189634323120117, "lr": 0.0002, "elapsed_sec": 42921.226907491684, "step_time_sec": 8.229150844999822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5175, "loss": 4.14362096786499, "lr": 0.0002, "elapsed_sec": 42929.45261096954, "step_time_sec": 8.225550588016631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5176, "loss": 3.975097894668579, "lr": 0.0002, "elapsed_sec": 42937.68173122406, "step_time_sec": 8.229017885983922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5177, "loss": 4.080315589904785, "lr": 0.0002, "elapsed_sec": 42945.91115951538, "step_time_sec": 8.22924310999224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5178, "loss": 4.083263874053955, "lr": 0.0002, "elapsed_sec": 42954.141721725464, "step_time_sec": 8.230384776979918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5179, "loss": 4.112806797027588, "lr": 0.0002, "elapsed_sec": 42962.37309765816, "step_time_sec": 8.231201260001399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5180, "loss": 4.066776752471924, "lr": 0.0002, "elapsed_sec": 42970.60320878029, "step_time_sec": 8.229967263003346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5181, "loss": 4.192162990570068, "lr": 0.0002, "elapsed_sec": 42978.8349571228, "step_time_sec": 8.231553512014216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5182, "loss": 4.214718341827393, "lr": 0.0002, "elapsed_sec": 42987.06390595436, "step_time_sec": 8.228813852998428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5183, "loss": 3.9888336658477783, "lr": 0.0002, "elapsed_sec": 42995.293551683426, "step_time_sec": 8.22944518100121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5184, "loss": 4.144224166870117, "lr": 0.0002, "elapsed_sec": 43003.52335071564, "step_time_sec": 8.229651267000008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5185, "loss": 4.144608020782471, "lr": 0.0002, "elapsed_sec": 43011.75037121773, "step_time_sec": 8.22686063501169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5186, "loss": 4.020516395568848, "lr": 0.0002, "elapsed_sec": 43019.9794716835, "step_time_sec": 8.228946771007031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5187, "loss": 4.038647651672363, "lr": 0.0002, "elapsed_sec": 43028.2104973793, "step_time_sec": 8.230910589016275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5188, "loss": 4.019751071929932, "lr": 0.0002, "elapsed_sec": 43036.44206357002, "step_time_sec": 8.231370806985069, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5189, "loss": 4.057969093322754, "lr": 0.0002, "elapsed_sec": 43044.672513246536, "step_time_sec": 8.230292296997504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5190, "loss": 4.0761799812316895, "lr": 0.0002, "elapsed_sec": 43052.9029314518, "step_time_sec": 8.23030098201707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5191, "loss": 4.052182197570801, "lr": 0.0002, "elapsed_sec": 43061.13156604767, "step_time_sec": 8.228425573004643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5192, "loss": 4.022703170776367, "lr": 0.0002, "elapsed_sec": 43069.36027622223, "step_time_sec": 8.22852111098473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5193, "loss": 4.26310396194458, "lr": 0.0002, "elapsed_sec": 43077.59093666077, "step_time_sec": 8.230508798005758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5194, "loss": 4.073235988616943, "lr": 0.0002, "elapsed_sec": 43085.82122659683, "step_time_sec": 8.230149581009755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5195, "loss": 4.150174140930176, "lr": 0.0002, "elapsed_sec": 43094.05261349678, "step_time_sec": 8.23121590501978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5196, "loss": 4.108185291290283, "lr": 0.0002, "elapsed_sec": 43102.28316926956, "step_time_sec": 8.230438097001752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5197, "loss": 4.069774150848389, "lr": 0.0002, "elapsed_sec": 43110.51394224167, "step_time_sec": 8.230601889983518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5198, "loss": 4.129143714904785, "lr": 0.0002, "elapsed_sec": 43118.745618104935, "step_time_sec": 8.23148213600507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5199, "loss": 4.054384708404541, "lr": 0.0002, "elapsed_sec": 43126.97497963905, "step_time_sec": 8.229231795005035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5200, "loss": 4.044466972351074, "lr": 0.0002, "elapsed_sec": 43135.20577812195, "step_time_sec": 8.23056213799282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5201, "loss": 4.202404499053955, "lr": 0.0002, "elapsed_sec": 43143.43731021881, "step_time_sec": 8.231380723998882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5202, "loss": 4.165354251861572, "lr": 0.0002, "elapsed_sec": 43151.667763233185, "step_time_sec": 8.23029685797519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5203, "loss": 4.085964679718018, "lr": 0.0002, "elapsed_sec": 43159.89949655533, "step_time_sec": 8.231602182990173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5204, "loss": 4.035040855407715, "lr": 0.0002, "elapsed_sec": 43168.13058304787, "step_time_sec": 8.23096801599604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5205, "loss": 4.061703681945801, "lr": 0.0002, "elapsed_sec": 43176.361481666565, "step_time_sec": 8.230687974020839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5206, "loss": 4.049102783203125, "lr": 0.0002, "elapsed_sec": 43184.59001803398, "step_time_sec": 8.228384066023864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5207, "loss": 4.028801441192627, "lr": 0.0002, "elapsed_sec": 43192.82010650635, "step_time_sec": 8.229961527016712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5208, "loss": 4.05100154876709, "lr": 0.0002, "elapsed_sec": 43201.05013632774, "step_time_sec": 8.229843371984316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5209, "loss": 4.226853847503662, "lr": 0.0002, "elapsed_sec": 43209.281061410904, "step_time_sec": 8.230763318017125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5210, "loss": 4.0206217765808105, "lr": 0.0002, "elapsed_sec": 43217.512310028076, "step_time_sec": 8.231068808003329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5211, "loss": 4.062520503997803, "lr": 0.0002, "elapsed_sec": 43225.74343776703, "step_time_sec": 8.231013758020708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5212, "loss": 4.0732879638671875, "lr": 0.0002, "elapsed_sec": 43233.97405600548, "step_time_sec": 8.230448959977366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5213, "loss": 4.02126932144165, "lr": 0.0002, "elapsed_sec": 43242.2026219368, "step_time_sec": 8.228413755976362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5214, "loss": 4.089313983917236, "lr": 0.0002, "elapsed_sec": 43250.43423604965, "step_time_sec": 8.231416319991695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5215, "loss": 4.128794193267822, "lr": 0.0002, "elapsed_sec": 43258.66539525986, "step_time_sec": 8.231006161979167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5216, "loss": 3.975149154663086, "lr": 0.0002, "elapsed_sec": 43266.89629936218, "step_time_sec": 8.230828594998457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5217, "loss": 4.020448684692383, "lr": 0.0002, "elapsed_sec": 43275.12575817108, "step_time_sec": 8.2293081189855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5218, "loss": 3.9848361015319824, "lr": 0.0002, "elapsed_sec": 43283.355421066284, "step_time_sec": 8.229468011995777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5219, "loss": 4.18712043762207, "lr": 0.0002, "elapsed_sec": 43291.584141254425, "step_time_sec": 8.22853830299573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5220, "loss": 4.29010009765625, "lr": 0.0002, "elapsed_sec": 43299.81232523918, "step_time_sec": 8.228045516996644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5221, "loss": 4.015557765960693, "lr": 0.0002, "elapsed_sec": 43308.03971338272, "step_time_sec": 8.22719486700953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5222, "loss": 4.138147830963135, "lr": 0.0002, "elapsed_sec": 43316.26999473572, "step_time_sec": 8.230132809025235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5223, "loss": 3.9736969470977783, "lr": 0.0002, "elapsed_sec": 43324.49838972092, "step_time_sec": 8.228235888003837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5224, "loss": 4.173072338104248, "lr": 0.0002, "elapsed_sec": 43332.726944208145, "step_time_sec": 8.228392907010857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5225, "loss": 3.9303581714630127, "lr": 0.0002, "elapsed_sec": 43340.9570813179, "step_time_sec": 8.230016821995378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5226, "loss": 4.065589904785156, "lr": 0.0002, "elapsed_sec": 43349.18414235115, "step_time_sec": 8.226853730011499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5227, "loss": 4.0067877769470215, "lr": 0.0002, "elapsed_sec": 43357.41443443298, "step_time_sec": 8.230212925001979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5228, "loss": 4.093386650085449, "lr": 0.0002, "elapsed_sec": 43365.64465332031, "step_time_sec": 8.230031046987278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5229, "loss": 4.007869720458984, "lr": 0.0002, "elapsed_sec": 43373.87445259094, "step_time_sec": 8.229593814001419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5230, "loss": 4.151526927947998, "lr": 0.0002, "elapsed_sec": 43382.10336852074, "step_time_sec": 8.228765040985309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5231, "loss": 4.1466803550720215, "lr": 0.0002, "elapsed_sec": 43390.33150911331, "step_time_sec": 8.227992650005035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5232, "loss": 4.077686786651611, "lr": 0.0002, "elapsed_sec": 43398.56080746651, "step_time_sec": 8.229193488979945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5233, "loss": 3.93864107131958, "lr": 0.0002, "elapsed_sec": 43406.79074501991, "step_time_sec": 8.229733389016474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5234, "loss": 3.9774882793426514, "lr": 0.0002, "elapsed_sec": 43415.02132129669, "step_time_sec": 8.230473292001989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5235, "loss": 4.050060272216797, "lr": 0.0002, "elapsed_sec": 43423.24967122078, "step_time_sec": 8.228180019010324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5236, "loss": 4.118514537811279, "lr": 0.0002, "elapsed_sec": 43431.47616624832, "step_time_sec": 8.226305710995803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5237, "loss": 3.948483467102051, "lr": 0.0002, "elapsed_sec": 43439.70693397522, "step_time_sec": 8.23062684101751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5238, "loss": 4.112937927246094, "lr": 0.0002, "elapsed_sec": 43447.93837475777, "step_time_sec": 8.231315331009682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5239, "loss": 4.180540084838867, "lr": 0.0002, "elapsed_sec": 43456.16676092148, "step_time_sec": 8.228174957999727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5240, "loss": 3.9648165702819824, "lr": 0.0002, "elapsed_sec": 43464.39507198334, "step_time_sec": 8.228152216004673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5241, "loss": 4.139224529266357, "lr": 0.0002, "elapsed_sec": 43472.6243686676, "step_time_sec": 8.229139822011348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5242, "loss": 3.9757120609283447, "lr": 0.0002, "elapsed_sec": 43480.85365319252, "step_time_sec": 8.229162536998047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5243, "loss": 4.085703372955322, "lr": 0.0002, "elapsed_sec": 43489.08378815651, "step_time_sec": 8.229916561977006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5244, "loss": 4.044645309448242, "lr": 0.0002, "elapsed_sec": 43497.31244683266, "step_time_sec": 8.228525618993444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5245, "loss": 4.017092227935791, "lr": 0.0002, "elapsed_sec": 43505.541546821594, "step_time_sec": 8.228944678005064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5246, "loss": 4.09835958480835, "lr": 0.0002, "elapsed_sec": 43513.7700946331, "step_time_sec": 8.228482706006616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5247, "loss": 4.155764102935791, "lr": 0.0002, "elapsed_sec": 43521.99973344803, "step_time_sec": 8.229348082008073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5248, "loss": 4.150148868560791, "lr": 0.0002, "elapsed_sec": 43530.23054599762, "step_time_sec": 8.230637927976204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5249, "loss": 4.059593200683594, "lr": 0.0002, "elapsed_sec": 43538.46166181564, "step_time_sec": 8.231015729979845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5250, "loss": 4.186098098754883, "lr": 0.0002, "elapsed_sec": 43546.69300842285, "step_time_sec": 8.231177605019184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5251, "loss": 4.106304168701172, "lr": 0.0002, "elapsed_sec": 43554.92503499985, "step_time_sec": 8.23183562498889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5252, "loss": 4.031148910522461, "lr": 0.0002, "elapsed_sec": 43563.1540749073, "step_time_sec": 8.228864211007021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5253, "loss": 4.078771114349365, "lr": 0.0002, "elapsed_sec": 43571.38302564621, "step_time_sec": 8.228806777013233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5254, "loss": 3.914281129837036, "lr": 0.0002, "elapsed_sec": 43579.614153146744, "step_time_sec": 8.230998521001311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5255, "loss": 3.8729496002197266, "lr": 0.0002, "elapsed_sec": 43587.84330534935, "step_time_sec": 8.22896519000642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5256, "loss": 4.170082092285156, "lr": 0.0002, "elapsed_sec": 43596.07387590408, "step_time_sec": 8.230449654016411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5257, "loss": 3.9498507976531982, "lr": 0.0002, "elapsed_sec": 43604.304106235504, "step_time_sec": 8.230026445002295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5258, "loss": 4.054964542388916, "lr": 0.0002, "elapsed_sec": 43612.532625198364, "step_time_sec": 8.228353684011381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5259, "loss": 4.222827434539795, "lr": 0.0002, "elapsed_sec": 43620.76222038269, "step_time_sec": 8.229452372994274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5260, "loss": 4.057734966278076, "lr": 0.0002, "elapsed_sec": 43628.99247717857, "step_time_sec": 8.23007801300264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5261, "loss": 4.088746547698975, "lr": 0.0002, "elapsed_sec": 43637.221960783005, "step_time_sec": 8.229338514007395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5262, "loss": 4.070169448852539, "lr": 0.0002, "elapsed_sec": 43645.452131032944, "step_time_sec": 8.230005572986556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5263, "loss": 4.226678371429443, "lr": 0.0002, "elapsed_sec": 43653.68247079849, "step_time_sec": 8.23028068599524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5264, "loss": 4.1411004066467285, "lr": 0.0002, "elapsed_sec": 43661.91273069382, "step_time_sec": 8.230019169015577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5265, "loss": 4.177613735198975, "lr": 0.0002, "elapsed_sec": 43670.14169025421, "step_time_sec": 8.228785910003353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5266, "loss": 4.065260887145996, "lr": 0.0002, "elapsed_sec": 43678.37212896347, "step_time_sec": 8.230382584995823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5267, "loss": 4.1135077476501465, "lr": 0.0002, "elapsed_sec": 43686.60310006142, "step_time_sec": 8.230704852991039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5268, "loss": 3.8451976776123047, "lr": 0.0002, "elapsed_sec": 43694.83484053612, "step_time_sec": 8.231598106998717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5269, "loss": 4.101529598236084, "lr": 0.0002, "elapsed_sec": 43703.066506147385, "step_time_sec": 8.23149263500818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5270, "loss": 4.029816150665283, "lr": 0.0002, "elapsed_sec": 43711.296557188034, "step_time_sec": 8.229948510008398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5271, "loss": 3.9634087085723877, "lr": 0.0002, "elapsed_sec": 43719.527161836624, "step_time_sec": 8.230435103003401, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5272, "loss": 3.982532024383545, "lr": 0.0002, "elapsed_sec": 43727.75769615173, "step_time_sec": 8.230362609989243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5273, "loss": 4.128046989440918, "lr": 0.0002, "elapsed_sec": 43735.98599100113, "step_time_sec": 8.228189474990359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5274, "loss": 3.9228785037994385, "lr": 0.0002, "elapsed_sec": 43744.214184999466, "step_time_sec": 8.22802828301792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5275, "loss": 4.312664031982422, "lr": 0.0002, "elapsed_sec": 43752.443378448486, "step_time_sec": 8.228985577996355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5276, "loss": 4.038946628570557, "lr": 0.0002, "elapsed_sec": 43760.672539711, "step_time_sec": 8.229011473013088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5277, "loss": 4.127288341522217, "lr": 0.0002, "elapsed_sec": 43768.902956962585, "step_time_sec": 8.230306615994778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5278, "loss": 4.1800456047058105, "lr": 0.0002, "elapsed_sec": 43777.13442134857, "step_time_sec": 8.231225505005568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5279, "loss": 4.030277729034424, "lr": 0.0002, "elapsed_sec": 43785.36557364464, "step_time_sec": 8.231024492997676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5280, "loss": 4.1757612228393555, "lr": 0.0002, "elapsed_sec": 43793.596195459366, "step_time_sec": 8.230451591982273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5281, "loss": 4.054234981536865, "lr": 0.0002, "elapsed_sec": 43801.82770776749, "step_time_sec": 8.231344520987477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5282, "loss": 4.153505802154541, "lr": 0.0002, "elapsed_sec": 43810.05849790573, "step_time_sec": 8.230649754987098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5283, "loss": 4.220991611480713, "lr": 0.0002, "elapsed_sec": 43818.28713083267, "step_time_sec": 8.228456697019283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5284, "loss": 4.170018672943115, "lr": 0.0002, "elapsed_sec": 43826.516272068024, "step_time_sec": 8.228979492996586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5285, "loss": 3.990243911743164, "lr": 0.0002, "elapsed_sec": 43834.7448515892, "step_time_sec": 8.22845155702089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5286, "loss": 4.193181037902832, "lr": 0.0002, "elapsed_sec": 43842.973189115524, "step_time_sec": 8.22818360797828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5287, "loss": 4.263396739959717, "lr": 0.0002, "elapsed_sec": 43851.202718257904, "step_time_sec": 8.229416613990907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5288, "loss": 4.170274257659912, "lr": 0.0002, "elapsed_sec": 43859.43258023262, "step_time_sec": 8.229645667015575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5289, "loss": 4.10939359664917, "lr": 0.0002, "elapsed_sec": 43867.66197299957, "step_time_sec": 8.229222349007614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5290, "loss": 4.101803302764893, "lr": 0.0002, "elapsed_sec": 43875.89219164848, "step_time_sec": 8.230128194001736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5291, "loss": 4.166301727294922, "lr": 0.0002, "elapsed_sec": 43884.11983513832, "step_time_sec": 8.227498555002967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5292, "loss": 4.121711730957031, "lr": 0.0002, "elapsed_sec": 43892.34869503975, "step_time_sec": 8.228650359000312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5293, "loss": 4.1445770263671875, "lr": 0.0002, "elapsed_sec": 43900.57900309563, "step_time_sec": 8.230117447004886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5294, "loss": 4.068601608276367, "lr": 0.0002, "elapsed_sec": 43908.80946612358, "step_time_sec": 8.230374595004832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5295, "loss": 4.019469261169434, "lr": 0.0002, "elapsed_sec": 43917.040419340134, "step_time_sec": 8.230755237018457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5296, "loss": 4.053472518920898, "lr": 0.0002, "elapsed_sec": 43925.270383358, "step_time_sec": 8.229805744020268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5297, "loss": 3.990037202835083, "lr": 0.0002, "elapsed_sec": 43933.4988090992, "step_time_sec": 8.228275471017696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5298, "loss": 4.05323600769043, "lr": 0.0002, "elapsed_sec": 43941.729476451874, "step_time_sec": 8.230521354009397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5299, "loss": 4.005690097808838, "lr": 0.0002, "elapsed_sec": 43949.959646463394, "step_time_sec": 8.230038624024019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5300, "loss": 4.145215034484863, "lr": 0.0002, "elapsed_sec": 43958.191267728806, "step_time_sec": 8.231445925019216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5301, "loss": 3.999459743499756, "lr": 0.0002, "elapsed_sec": 43966.42239499092, "step_time_sec": 8.23099458697834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5302, "loss": 4.196981430053711, "lr": 0.0002, "elapsed_sec": 43974.65340209007, "step_time_sec": 8.230870897998102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5303, "loss": 4.111754894256592, "lr": 0.0002, "elapsed_sec": 43982.88391780853, "step_time_sec": 8.230310107988771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5304, "loss": 4.033180236816406, "lr": 0.0002, "elapsed_sec": 43991.11469101906, "step_time_sec": 8.230678539985092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5305, "loss": 4.166674613952637, "lr": 0.0002, "elapsed_sec": 43999.34602165222, "step_time_sec": 8.231130035012029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5306, "loss": 4.134483814239502, "lr": 0.0002, "elapsed_sec": 44007.57505965233, "step_time_sec": 8.228852922009537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5307, "loss": 4.193135738372803, "lr": 0.0002, "elapsed_sec": 44015.80644154549, "step_time_sec": 8.231225393974455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5308, "loss": 4.124547958374023, "lr": 0.0002, "elapsed_sec": 44024.03712248802, "step_time_sec": 8.230589931976283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5309, "loss": 4.243001937866211, "lr": 0.0002, "elapsed_sec": 44032.268540620804, "step_time_sec": 8.231236096995417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5310, "loss": 4.024174213409424, "lr": 0.0002, "elapsed_sec": 44040.499656915665, "step_time_sec": 8.230912417988293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5311, "loss": 3.9989380836486816, "lr": 0.0002, "elapsed_sec": 44048.72823762894, "step_time_sec": 8.228484569001012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5312, "loss": 3.9714229106903076, "lr": 0.0002, "elapsed_sec": 44056.95783853531, "step_time_sec": 8.22944467101479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5313, "loss": 4.1525468826293945, "lr": 0.0002, "elapsed_sec": 44065.18802881241, "step_time_sec": 8.22998095100047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5314, "loss": 3.9690027236938477, "lr": 0.0002, "elapsed_sec": 44073.41936016083, "step_time_sec": 8.231168808008078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5315, "loss": 4.135546684265137, "lr": 0.0002, "elapsed_sec": 44081.65054774284, "step_time_sec": 8.231040067999857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5316, "loss": 4.087793350219727, "lr": 0.0002, "elapsed_sec": 44089.88167977333, "step_time_sec": 8.231004678993486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5317, "loss": 4.132168292999268, "lr": 0.0002, "elapsed_sec": 44098.11261367798, "step_time_sec": 8.230743344989605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5318, "loss": 4.046964645385742, "lr": 0.0002, "elapsed_sec": 44106.33976483345, "step_time_sec": 8.227032749011414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5319, "loss": 4.2549920082092285, "lr": 0.0002, "elapsed_sec": 44114.56988930702, "step_time_sec": 8.22990916800336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5320, "loss": 4.207277297973633, "lr": 0.0002, "elapsed_sec": 44122.800146102905, "step_time_sec": 8.230107056995621, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5321, "loss": 4.094368934631348, "lr": 0.0002, "elapsed_sec": 44131.03120517731, "step_time_sec": 8.230971993005369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5322, "loss": 4.129560947418213, "lr": 0.0002, "elapsed_sec": 44139.26276946068, "step_time_sec": 8.231405264989007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5323, "loss": 4.415639400482178, "lr": 0.0002, "elapsed_sec": 44147.49368071556, "step_time_sec": 8.230679277010495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5324, "loss": 4.111254692077637, "lr": 0.0002, "elapsed_sec": 44155.725044965744, "step_time_sec": 8.231200643000193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5325, "loss": 3.988238573074341, "lr": 0.0002, "elapsed_sec": 44163.95558786392, "step_time_sec": 8.230384641006822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5326, "loss": 4.207367420196533, "lr": 0.0002, "elapsed_sec": 44172.18565821648, "step_time_sec": 8.229938587988727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5327, "loss": 4.130311965942383, "lr": 0.0002, "elapsed_sec": 44180.417025089264, "step_time_sec": 8.231199428992113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5328, "loss": 4.140885829925537, "lr": 0.0002, "elapsed_sec": 44188.64792609215, "step_time_sec": 8.23072388098808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5329, "loss": 4.052973747253418, "lr": 0.0002, "elapsed_sec": 44196.87936425209, "step_time_sec": 8.231276020989753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5330, "loss": 4.2169036865234375, "lr": 0.0002, "elapsed_sec": 44205.10886669159, "step_time_sec": 8.229377436975483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5331, "loss": 4.025973796844482, "lr": 0.0002, "elapsed_sec": 44213.340057611465, "step_time_sec": 8.231003109016456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5332, "loss": 4.091495990753174, "lr": 0.0002, "elapsed_sec": 44221.56989169121, "step_time_sec": 8.229648784006713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5333, "loss": 4.091911315917969, "lr": 0.0002, "elapsed_sec": 44229.79955244064, "step_time_sec": 8.22950039501302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5334, "loss": 3.958874225616455, "lr": 0.0002, "elapsed_sec": 44238.02697134018, "step_time_sec": 8.22723646700615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5335, "loss": 4.113738059997559, "lr": 0.0002, "elapsed_sec": 44246.25643157959, "step_time_sec": 8.229355987015879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5336, "loss": 4.067473411560059, "lr": 0.0002, "elapsed_sec": 44254.4845867157, "step_time_sec": 8.227992469008313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5337, "loss": 4.117400169372559, "lr": 0.0002, "elapsed_sec": 44262.71486878395, "step_time_sec": 8.23007692900137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5338, "loss": 4.1564106941223145, "lr": 0.0002, "elapsed_sec": 44270.947053432465, "step_time_sec": 8.232046001008712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5339, "loss": 4.128871917724609, "lr": 0.0002, "elapsed_sec": 44279.17768454552, "step_time_sec": 8.230459871003404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5340, "loss": 4.079959392547607, "lr": 0.0002, "elapsed_sec": 44287.40969324112, "step_time_sec": 8.231879800994648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5341, "loss": 4.089249134063721, "lr": 0.0002, "elapsed_sec": 44295.6425178051, "step_time_sec": 8.232681071996922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5342, "loss": 4.122392654418945, "lr": 0.0002, "elapsed_sec": 44303.872764110565, "step_time_sec": 8.230009741004324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5343, "loss": 4.024392604827881, "lr": 0.0002, "elapsed_sec": 44312.103649139404, "step_time_sec": 8.230721375002759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5344, "loss": 4.109583377838135, "lr": 0.0002, "elapsed_sec": 44320.33417344093, "step_time_sec": 8.230386727984296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5345, "loss": 4.072851181030273, "lr": 0.0002, "elapsed_sec": 44328.564363479614, "step_time_sec": 8.230037799978163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5346, "loss": 4.089802265167236, "lr": 0.0002, "elapsed_sec": 44336.79378223419, "step_time_sec": 8.229213804006577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5347, "loss": 4.174381256103516, "lr": 0.0002, "elapsed_sec": 44345.02401018143, "step_time_sec": 8.23008735399344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5348, "loss": 4.16497802734375, "lr": 0.0002, "elapsed_sec": 44353.2534134388, "step_time_sec": 8.229248818999622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5349, "loss": 3.9789927005767822, "lr": 0.0002, "elapsed_sec": 44361.48242640495, "step_time_sec": 8.22888832501485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5350, "loss": 3.888421058654785, "lr": 0.0002, "elapsed_sec": 44369.71314787865, "step_time_sec": 8.230577956972411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5351, "loss": 4.166144847869873, "lr": 0.0002, "elapsed_sec": 44377.94346809387, "step_time_sec": 8.23010879001231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5352, "loss": 4.107031345367432, "lr": 0.0002, "elapsed_sec": 44386.17481994629, "step_time_sec": 8.231168860016624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5353, "loss": 4.006103038787842, "lr": 0.0002, "elapsed_sec": 44394.40614080429, "step_time_sec": 8.231198795023374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5354, "loss": 4.277659893035889, "lr": 0.0002, "elapsed_sec": 44402.637419223785, "step_time_sec": 8.231059017009102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5355, "loss": 4.155068397521973, "lr": 0.0002, "elapsed_sec": 44410.86777853966, "step_time_sec": 8.230201133992523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5356, "loss": 4.000691890716553, "lr": 0.0002, "elapsed_sec": 44419.097902059555, "step_time_sec": 8.229978933988605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5357, "loss": 4.059609889984131, "lr": 0.0002, "elapsed_sec": 44427.32668709755, "step_time_sec": 8.228657729981933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5358, "loss": 4.118340969085693, "lr": 0.0002, "elapsed_sec": 44435.55690050125, "step_time_sec": 8.230065451993141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5359, "loss": 4.216427803039551, "lr": 0.0002, "elapsed_sec": 44443.78806471825, "step_time_sec": 8.230966581002576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5360, "loss": 4.126765251159668, "lr": 0.0002, "elapsed_sec": 44452.01791834831, "step_time_sec": 8.229648767999606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5361, "loss": 4.0758843421936035, "lr": 0.0002, "elapsed_sec": 44460.24666905403, "step_time_sec": 8.228586226992775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5362, "loss": 4.013856887817383, "lr": 0.0002, "elapsed_sec": 44468.47445273399, "step_time_sec": 8.227661106007872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5363, "loss": 4.215982437133789, "lr": 0.0002, "elapsed_sec": 44476.7051115036, "step_time_sec": 8.230544739984907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5364, "loss": 4.118481636047363, "lr": 0.0002, "elapsed_sec": 44484.93618130684, "step_time_sec": 8.230823403020622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5365, "loss": 4.072933673858643, "lr": 0.0002, "elapsed_sec": 44493.16682720184, "step_time_sec": 8.230488479021005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5366, "loss": 4.253246307373047, "lr": 0.0002, "elapsed_sec": 44501.39789891243, "step_time_sec": 8.230949196993606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5367, "loss": 4.027832508087158, "lr": 0.0002, "elapsed_sec": 44509.63028097153, "step_time_sec": 8.232256368006347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5368, "loss": 4.152889728546143, "lr": 0.0002, "elapsed_sec": 44517.85811281204, "step_time_sec": 8.227608965011314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5369, "loss": 4.095968723297119, "lr": 0.0002, "elapsed_sec": 44526.088624715805, "step_time_sec": 8.230407261027722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5370, "loss": 4.031872272491455, "lr": 0.0002, "elapsed_sec": 44534.31958794594, "step_time_sec": 8.230707575014094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5371, "loss": 4.165981292724609, "lr": 0.0002, "elapsed_sec": 44542.55111217499, "step_time_sec": 8.231376957002794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5372, "loss": 4.095240592956543, "lr": 0.0002, "elapsed_sec": 44550.78194308281, "step_time_sec": 8.23066105999169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5373, "loss": 4.076562881469727, "lr": 0.0002, "elapsed_sec": 44559.01311540604, "step_time_sec": 8.230995796999196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5374, "loss": 4.0916218757629395, "lr": 0.0002, "elapsed_sec": 44567.24463009834, "step_time_sec": 8.231343838007888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5375, "loss": 4.059723854064941, "lr": 0.0002, "elapsed_sec": 44575.475848436356, "step_time_sec": 8.231065934989601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5376, "loss": 4.116049766540527, "lr": 0.0002, "elapsed_sec": 44583.70593094826, "step_time_sec": 8.229937366006197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5377, "loss": 4.049222946166992, "lr": 0.0002, "elapsed_sec": 44591.934720516205, "step_time_sec": 8.228647197014652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5378, "loss": 4.122207164764404, "lr": 0.0002, "elapsed_sec": 44600.164506196976, "step_time_sec": 8.229607019980904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5379, "loss": 4.135268688201904, "lr": 0.0002, "elapsed_sec": 44608.39482784271, "step_time_sec": 8.230147952999687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5380, "loss": 4.1314191818237305, "lr": 0.0002, "elapsed_sec": 44616.62572932243, "step_time_sec": 8.230782966013066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5381, "loss": 3.8458688259124756, "lr": 0.0002, "elapsed_sec": 44624.85712528229, "step_time_sec": 8.231255960999988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5382, "loss": 4.071778297424316, "lr": 0.0002, "elapsed_sec": 44633.088128089905, "step_time_sec": 8.230775909993099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5383, "loss": 4.030184268951416, "lr": 0.0002, "elapsed_sec": 44641.319428920746, "step_time_sec": 8.231158209004207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5384, "loss": 4.039957046508789, "lr": 0.0002, "elapsed_sec": 44649.550114393234, "step_time_sec": 8.2305338709848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5385, "loss": 3.9926083087921143, "lr": 0.0002, "elapsed_sec": 44657.782081365585, "step_time_sec": 8.231761968985666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5386, "loss": 4.067318439483643, "lr": 0.0002, "elapsed_sec": 44666.01183247566, "step_time_sec": 8.2295952590066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5387, "loss": 4.077134609222412, "lr": 0.0002, "elapsed_sec": 44674.2423825264, "step_time_sec": 8.230456479010172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5388, "loss": 4.134898662567139, "lr": 0.0002, "elapsed_sec": 44682.47360539436, "step_time_sec": 8.230996454018168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5389, "loss": 4.099691390991211, "lr": 0.0002, "elapsed_sec": 44690.702471256256, "step_time_sec": 8.228709402028471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5390, "loss": 3.9976418018341064, "lr": 0.0002, "elapsed_sec": 44698.93363237381, "step_time_sec": 8.230998490005732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5391, "loss": 4.037163734436035, "lr": 0.0002, "elapsed_sec": 44707.16405963898, "step_time_sec": 8.23025471699657, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5392, "loss": 4.080487251281738, "lr": 0.0002, "elapsed_sec": 44715.3937394619, "step_time_sec": 8.229600788996322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5393, "loss": 4.0855793952941895, "lr": 0.0002, "elapsed_sec": 44723.6233754158, "step_time_sec": 8.22940474600182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5394, "loss": 4.079326629638672, "lr": 0.0002, "elapsed_sec": 44731.85382413864, "step_time_sec": 8.230323844007216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5395, "loss": 4.065242290496826, "lr": 0.0002, "elapsed_sec": 44740.08527350426, "step_time_sec": 8.231293696997454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5396, "loss": 4.080923080444336, "lr": 0.0002, "elapsed_sec": 44748.31336903572, "step_time_sec": 8.22792928799754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5397, "loss": 4.060666561126709, "lr": 0.0002, "elapsed_sec": 44756.544827222824, "step_time_sec": 8.23126582801342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5398, "loss": 4.090270042419434, "lr": 0.0002, "elapsed_sec": 44764.77627301216, "step_time_sec": 8.231307245994685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5399, "loss": 4.200323581695557, "lr": 0.0002, "elapsed_sec": 44773.00782227516, "step_time_sec": 8.231334664014867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5400, "loss": 4.201820373535156, "lr": 0.0002, "elapsed_sec": 44781.238970041275, "step_time_sec": 8.230986688024132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5401, "loss": 4.084059715270996, "lr": 0.0002, "elapsed_sec": 44789.47071361542, "step_time_sec": 8.231580351974117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5402, "loss": 4.095048904418945, "lr": 0.0002, "elapsed_sec": 44797.70235180855, "step_time_sec": 8.23153894697316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5403, "loss": 4.071706295013428, "lr": 0.0002, "elapsed_sec": 44805.93370223045, "step_time_sec": 8.231147139013046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5404, "loss": 4.113619327545166, "lr": 0.0002, "elapsed_sec": 44814.1649954319, "step_time_sec": 8.231096584990155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5405, "loss": 4.129576206207275, "lr": 0.0002, "elapsed_sec": 44822.394161224365, "step_time_sec": 8.229004121996695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5406, "loss": 3.993427038192749, "lr": 0.0002, "elapsed_sec": 44830.624591350555, "step_time_sec": 8.230270737985848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5407, "loss": 4.072210311889648, "lr": 0.0002, "elapsed_sec": 44838.85548710823, "step_time_sec": 8.23071064701071, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5408, "loss": 4.114748954772949, "lr": 0.0002, "elapsed_sec": 44847.0863673687, "step_time_sec": 8.230753106996417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5409, "loss": 4.002819061279297, "lr": 0.0002, "elapsed_sec": 44855.317618370056, "step_time_sec": 8.231061183993006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5410, "loss": 4.030467987060547, "lr": 0.0002, "elapsed_sec": 44863.54764008522, "step_time_sec": 8.229896920995088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5411, "loss": 4.2205586433410645, "lr": 0.0002, "elapsed_sec": 44871.77944254875, "step_time_sec": 8.23162560298806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5412, "loss": 3.9934754371643066, "lr": 0.0002, "elapsed_sec": 44880.00956058502, "step_time_sec": 8.229958509007702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5413, "loss": 4.098668575286865, "lr": 0.0002, "elapsed_sec": 44888.23965334892, "step_time_sec": 8.229911456990521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5414, "loss": 4.10671329498291, "lr": 0.0002, "elapsed_sec": 44896.4704349041, "step_time_sec": 8.230609138001455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5415, "loss": 3.9424357414245605, "lr": 0.0002, "elapsed_sec": 44904.70124077797, "step_time_sec": 8.23064534200239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5416, "loss": 4.20057487487793, "lr": 0.0002, "elapsed_sec": 44912.932743787766, "step_time_sec": 8.23134528301307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5417, "loss": 4.000094890594482, "lr": 0.0002, "elapsed_sec": 44921.162992954254, "step_time_sec": 8.230146923015127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5418, "loss": 4.080565929412842, "lr": 0.0002, "elapsed_sec": 44929.39239358902, "step_time_sec": 8.22916060400894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5419, "loss": 4.158111095428467, "lr": 0.0002, "elapsed_sec": 44937.62187695503, "step_time_sec": 8.22932566498639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5420, "loss": 4.128824710845947, "lr": 0.0002, "elapsed_sec": 44945.85099482536, "step_time_sec": 8.228996891004499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5421, "loss": 4.12371826171875, "lr": 0.0002, "elapsed_sec": 44954.08038663864, "step_time_sec": 8.22920713701751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5422, "loss": 3.9923133850097656, "lr": 0.0002, "elapsed_sec": 44962.30904150009, "step_time_sec": 8.228493263013661, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5423, "loss": 4.0470356941223145, "lr": 0.0002, "elapsed_sec": 44970.54062628746, "step_time_sec": 8.231386725994525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5424, "loss": 4.070606708526611, "lr": 0.0002, "elapsed_sec": 44978.771178007126, "step_time_sec": 8.230393001023913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5425, "loss": 4.137372016906738, "lr": 0.0002, "elapsed_sec": 44987.00008368492, "step_time_sec": 8.228793615999166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5426, "loss": 4.105534553527832, "lr": 0.0002, "elapsed_sec": 44995.23122382164, "step_time_sec": 8.230987760005519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5427, "loss": 4.058445930480957, "lr": 0.0002, "elapsed_sec": 45003.46173095703, "step_time_sec": 8.230289661005372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5428, "loss": 4.131017684936523, "lr": 0.0002, "elapsed_sec": 45011.69284820557, "step_time_sec": 8.23093853300088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5429, "loss": 4.084017753601074, "lr": 0.0002, "elapsed_sec": 45019.9236702919, "step_time_sec": 8.230698831001064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5430, "loss": 4.044315814971924, "lr": 0.0002, "elapsed_sec": 45028.15523314476, "step_time_sec": 8.231372702983208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5431, "loss": 4.14283561706543, "lr": 0.0002, "elapsed_sec": 45036.38596534729, "step_time_sec": 8.230555573973106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5432, "loss": 4.090673446655273, "lr": 0.0002, "elapsed_sec": 45044.617688179016, "step_time_sec": 8.231565264024539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5433, "loss": 3.951880693435669, "lr": 0.0002, "elapsed_sec": 45052.84722661972, "step_time_sec": 8.229385045007803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5434, "loss": 4.171011447906494, "lr": 0.0002, "elapsed_sec": 45061.07526063919, "step_time_sec": 8.227905081992503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5435, "loss": 4.063583850860596, "lr": 0.0002, "elapsed_sec": 45069.30428957939, "step_time_sec": 8.228843197983224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5436, "loss": 4.1011128425598145, "lr": 0.0002, "elapsed_sec": 45077.53437829018, "step_time_sec": 8.229917853983352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5437, "loss": 4.019417762756348, "lr": 0.0002, "elapsed_sec": 45085.765325307846, "step_time_sec": 8.23081162100425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5438, "loss": 4.069201469421387, "lr": 0.0002, "elapsed_sec": 45093.99601602554, "step_time_sec": 8.230523644015193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5439, "loss": 4.006721496582031, "lr": 0.0002, "elapsed_sec": 45102.226840257645, "step_time_sec": 8.230662061017938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5440, "loss": 4.10087251663208, "lr": 0.0002, "elapsed_sec": 45110.45842957497, "step_time_sec": 8.231422076001763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5441, "loss": 4.012565612792969, "lr": 0.0002, "elapsed_sec": 45118.68875980377, "step_time_sec": 8.230126342998119, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5442, "loss": 4.0297532081604, "lr": 0.0002, "elapsed_sec": 45126.920087099075, "step_time_sec": 8.231236977997469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5443, "loss": 4.100466251373291, "lr": 0.0002, "elapsed_sec": 45135.151139974594, "step_time_sec": 8.230903185001807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5444, "loss": 4.066948413848877, "lr": 0.0002, "elapsed_sec": 45143.380474090576, "step_time_sec": 8.22911132799345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5445, "loss": 4.020194053649902, "lr": 0.0002, "elapsed_sec": 45151.60930085182, "step_time_sec": 8.228662359004375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5446, "loss": 4.087247848510742, "lr": 0.0002, "elapsed_sec": 45159.84003329277, "step_time_sec": 8.230585175013402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5447, "loss": 4.077870845794678, "lr": 0.0002, "elapsed_sec": 45168.072957992554, "step_time_sec": 8.232777199998964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5448, "loss": 4.082046985626221, "lr": 0.0002, "elapsed_sec": 45176.30313158035, "step_time_sec": 8.229983392986469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5449, "loss": 4.0888590812683105, "lr": 0.0002, "elapsed_sec": 45184.53407692909, "step_time_sec": 8.230804052000167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5450, "loss": 3.994147777557373, "lr": 0.0002, "elapsed_sec": 45192.76446437836, "step_time_sec": 8.230177068995545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5451, "loss": 4.093277454376221, "lr": 0.0002, "elapsed_sec": 45200.99335169792, "step_time_sec": 8.228762282000389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5452, "loss": 4.135866165161133, "lr": 0.0002, "elapsed_sec": 45209.22198987007, "step_time_sec": 8.228438595018815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5453, "loss": 4.070179462432861, "lr": 0.0002, "elapsed_sec": 45217.451662778854, "step_time_sec": 8.229532409022795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5454, "loss": 4.103725910186768, "lr": 0.0002, "elapsed_sec": 45225.6812710762, "step_time_sec": 8.229444517986849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5455, "loss": 4.122385501861572, "lr": 0.0002, "elapsed_sec": 45233.91036939621, "step_time_sec": 8.228886706987396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5456, "loss": 4.217254638671875, "lr": 0.0002, "elapsed_sec": 45242.138850450516, "step_time_sec": 8.228363094007364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5457, "loss": 4.003715515136719, "lr": 0.0002, "elapsed_sec": 45250.36802983284, "step_time_sec": 8.229015308985254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5458, "loss": 4.0477471351623535, "lr": 0.0002, "elapsed_sec": 45258.596640348434, "step_time_sec": 8.228407919988967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5459, "loss": 4.486151695251465, "lr": 0.0002, "elapsed_sec": 45266.82579445839, "step_time_sec": 8.22897923499113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5460, "loss": 4.056805610656738, "lr": 0.0002, "elapsed_sec": 45275.05489110947, "step_time_sec": 8.228927702992223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5461, "loss": 4.0736589431762695, "lr": 0.0002, "elapsed_sec": 45283.28297662735, "step_time_sec": 8.227921753015835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5462, "loss": 4.1230926513671875, "lr": 0.0002, "elapsed_sec": 45291.51087927818, "step_time_sec": 8.227746552001918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5463, "loss": 4.0134429931640625, "lr": 0.0002, "elapsed_sec": 45299.73832964897, "step_time_sec": 8.227325321000535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5464, "loss": 4.138050556182861, "lr": 0.0002, "elapsed_sec": 45307.966556310654, "step_time_sec": 8.22806033998495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5465, "loss": 4.0792622566223145, "lr": 0.0002, "elapsed_sec": 45316.196811676025, "step_time_sec": 8.230073909973726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5466, "loss": 4.009525299072266, "lr": 0.0002, "elapsed_sec": 45324.42721939087, "step_time_sec": 8.23025805799989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5467, "loss": 4.0111308097839355, "lr": 0.0002, "elapsed_sec": 45332.65809130669, "step_time_sec": 8.230687717994442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5468, "loss": 4.154489040374756, "lr": 0.0002, "elapsed_sec": 45340.888833761215, "step_time_sec": 8.23057364000124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5469, "loss": 4.003884792327881, "lr": 0.0002, "elapsed_sec": 45349.12052512169, "step_time_sec": 8.231524687987985, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5470, "loss": 3.872070789337158, "lr": 0.0002, "elapsed_sec": 45357.35061955452, "step_time_sec": 8.22998142297729, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5471, "loss": 3.9363391399383545, "lr": 0.0002, "elapsed_sec": 45365.58098769188, "step_time_sec": 8.23019612499047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5472, "loss": 3.9134182929992676, "lr": 0.0002, "elapsed_sec": 45373.81145405769, "step_time_sec": 8.230286370991962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5473, "loss": 4.032241344451904, "lr": 0.0002, "elapsed_sec": 45382.03923749924, "step_time_sec": 8.22754284500843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5474, "loss": 4.080241680145264, "lr": 0.0002, "elapsed_sec": 45390.268775463104, "step_time_sec": 8.229387135012075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5475, "loss": 4.083611965179443, "lr": 0.0002, "elapsed_sec": 45398.49804353714, "step_time_sec": 8.229113052017055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5476, "loss": 3.931842565536499, "lr": 0.0002, "elapsed_sec": 45406.72640109062, "step_time_sec": 8.22820018499624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5477, "loss": 3.9116954803466797, "lr": 0.0002, "elapsed_sec": 45414.95425486565, "step_time_sec": 8.227658230025554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5478, "loss": 4.193292617797852, "lr": 0.0002, "elapsed_sec": 45423.1828186512, "step_time_sec": 8.228396844991948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5479, "loss": 4.065478801727295, "lr": 0.0002, "elapsed_sec": 45431.41371130943, "step_time_sec": 8.230810506996932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5480, "loss": 4.104193687438965, "lr": 0.0002, "elapsed_sec": 45439.64425730705, "step_time_sec": 8.23034699098207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5481, "loss": 3.8617701530456543, "lr": 0.0002, "elapsed_sec": 45447.87324953079, "step_time_sec": 8.228806807019282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5482, "loss": 4.217314720153809, "lr": 0.0002, "elapsed_sec": 45456.10119891167, "step_time_sec": 8.227764243987622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5483, "loss": 3.9502413272857666, "lr": 0.0002, "elapsed_sec": 45464.32866239548, "step_time_sec": 8.227297195000574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5484, "loss": 4.069271087646484, "lr": 0.0002, "elapsed_sec": 45472.55916786194, "step_time_sec": 8.230389960983302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5485, "loss": 3.960663080215454, "lr": 0.0002, "elapsed_sec": 45480.787172079086, "step_time_sec": 8.227837915997952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5486, "loss": 4.042372226715088, "lr": 0.0002, "elapsed_sec": 45489.01591682434, "step_time_sec": 8.228541032003704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5487, "loss": 4.048230171203613, "lr": 0.0002, "elapsed_sec": 45497.244976997375, "step_time_sec": 8.22894237699802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5488, "loss": 4.136467456817627, "lr": 0.0002, "elapsed_sec": 45505.47291088104, "step_time_sec": 8.22769273800077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5489, "loss": 3.9592952728271484, "lr": 0.0002, "elapsed_sec": 45513.70413041115, "step_time_sec": 8.231082415994024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5490, "loss": 4.130417823791504, "lr": 0.0002, "elapsed_sec": 45521.934549331665, "step_time_sec": 8.230247170984512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5491, "loss": 4.045634746551514, "lr": 0.0002, "elapsed_sec": 45530.165623664856, "step_time_sec": 8.23097358099767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5492, "loss": 3.9514169692993164, "lr": 0.0002, "elapsed_sec": 45538.39657330513, "step_time_sec": 8.230792623013258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5493, "loss": 4.163862705230713, "lr": 0.0002, "elapsed_sec": 45546.627123594284, "step_time_sec": 8.230316920991754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5494, "loss": 3.8123109340667725, "lr": 0.0002, "elapsed_sec": 45554.85632824898, "step_time_sec": 8.229047736007487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5495, "loss": 4.0733866691589355, "lr": 0.0002, "elapsed_sec": 45563.085008621216, "step_time_sec": 8.228530371998204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5496, "loss": 4.027347087860107, "lr": 0.0002, "elapsed_sec": 45571.314229011536, "step_time_sec": 8.229085906990804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5497, "loss": 4.188486099243164, "lr": 0.0002, "elapsed_sec": 45579.54521203041, "step_time_sec": 8.2308597349911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5498, "loss": 4.035097122192383, "lr": 0.0002, "elapsed_sec": 45587.77357816696, "step_time_sec": 8.22813722200226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5499, "loss": 4.126668930053711, "lr": 0.0002, "elapsed_sec": 45596.00420713425, "step_time_sec": 8.23051653499715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5500, "loss": 4.08742094039917, "lr": 0.0002, "elapsed_sec": 45604.23372268677, "step_time_sec": 30.710039290977875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5501, "loss": 4.327143669128418, "lr": 0.0002, "elapsed_sec": 45634.95300102234, "step_time_sec": 8.238118070992641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5502, "loss": 4.080867767333984, "lr": 0.0002, "elapsed_sec": 45643.16990113258, "step_time_sec": 8.216740336007206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5503, "loss": 3.9109995365142822, "lr": 0.0002, "elapsed_sec": 45651.39089131355, "step_time_sec": 8.220743220997974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5504, "loss": 4.116318702697754, "lr": 0.0002, "elapsed_sec": 45659.621460199356, "step_time_sec": 8.23046342600719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5505, "loss": 3.9655418395996094, "lr": 0.0002, "elapsed_sec": 45667.85307788849, "step_time_sec": 8.231432999979006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5506, "loss": 4.0871100425720215, "lr": 0.0002, "elapsed_sec": 45676.083632946014, "step_time_sec": 8.230436568992445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5507, "loss": 4.038854598999023, "lr": 0.0002, "elapsed_sec": 45684.31424641609, "step_time_sec": 8.230411530006677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5508, "loss": 3.9992051124572754, "lr": 0.0002, "elapsed_sec": 45692.54355788231, "step_time_sec": 8.229192624014104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5509, "loss": 4.105510711669922, "lr": 0.0002, "elapsed_sec": 45700.77140688896, "step_time_sec": 8.22764788600034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5510, "loss": 4.119811058044434, "lr": 0.0002, "elapsed_sec": 45709.00125002861, "step_time_sec": 8.229712439992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5511, "loss": 4.1721882820129395, "lr": 0.0002, "elapsed_sec": 45717.23163628578, "step_time_sec": 8.230199626996182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5512, "loss": 3.911829948425293, "lr": 0.0002, "elapsed_sec": 45725.46216130257, "step_time_sec": 8.230327457014937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5513, "loss": 4.18174934387207, "lr": 0.0002, "elapsed_sec": 45733.69146108627, "step_time_sec": 8.229212147009093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5514, "loss": 3.9450645446777344, "lr": 0.0002, "elapsed_sec": 45741.922843933105, "step_time_sec": 8.231143195996992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5515, "loss": 4.049205303192139, "lr": 0.0002, "elapsed_sec": 45750.15307807922, "step_time_sec": 8.230054980987916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5516, "loss": 4.115837097167969, "lr": 0.0002, "elapsed_sec": 45758.38262653351, "step_time_sec": 8.229389885003911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5517, "loss": 3.8806557655334473, "lr": 0.0002, "elapsed_sec": 45766.61388993263, "step_time_sec": 8.231121933989925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5518, "loss": 4.012770652770996, "lr": 0.0002, "elapsed_sec": 45774.84430193901, "step_time_sec": 8.23021405300824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5519, "loss": 3.9587795734405518, "lr": 0.0002, "elapsed_sec": 45783.07594704628, "step_time_sec": 8.23148246400524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5520, "loss": 4.011881351470947, "lr": 0.0002, "elapsed_sec": 45791.306620121, "step_time_sec": 8.230589143000543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5521, "loss": 3.9840002059936523, "lr": 0.0002, "elapsed_sec": 45799.53686666489, "step_time_sec": 8.230033778992947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5522, "loss": 3.9849584102630615, "lr": 0.0002, "elapsed_sec": 45807.76581740379, "step_time_sec": 8.228790160996141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5523, "loss": 4.1075239181518555, "lr": 0.0002, "elapsed_sec": 45815.99488067627, "step_time_sec": 8.22888951501227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5524, "loss": 3.9805521965026855, "lr": 0.0002, "elapsed_sec": 45824.22380542755, "step_time_sec": 8.228727759997128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5525, "loss": 4.142551898956299, "lr": 0.0002, "elapsed_sec": 45832.45490574837, "step_time_sec": 8.230955335020553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5526, "loss": 4.129153251647949, "lr": 0.0002, "elapsed_sec": 45840.6853017807, "step_time_sec": 8.230196353018982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5527, "loss": 4.094875812530518, "lr": 0.0002, "elapsed_sec": 45848.91612625122, "step_time_sec": 8.230651186982868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5528, "loss": 4.007862091064453, "lr": 0.0002, "elapsed_sec": 45857.14792919159, "step_time_sec": 8.231637616991065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5529, "loss": 4.101215839385986, "lr": 0.0002, "elapsed_sec": 45865.37880063057, "step_time_sec": 8.230692781013204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5530, "loss": 4.070682525634766, "lr": 0.0002, "elapsed_sec": 45873.61028337479, "step_time_sec": 8.231322281004395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5531, "loss": 3.90354061126709, "lr": 0.0002, "elapsed_sec": 45881.84111785889, "step_time_sec": 8.230679747997783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5532, "loss": 4.076213836669922, "lr": 0.0002, "elapsed_sec": 45890.07264113426, "step_time_sec": 8.231375051982468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5533, "loss": 4.059908866882324, "lr": 0.0002, "elapsed_sec": 45898.30232000351, "step_time_sec": 8.229571202013176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5534, "loss": 4.069532871246338, "lr": 0.0002, "elapsed_sec": 45906.53154754639, "step_time_sec": 8.229042098013451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5535, "loss": 4.062557220458984, "lr": 0.0002, "elapsed_sec": 45914.76220083237, "step_time_sec": 8.230448488000548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5536, "loss": 4.159934997558594, "lr": 0.0002, "elapsed_sec": 45922.9926905632, "step_time_sec": 8.230300560011528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5537, "loss": 4.10466194152832, "lr": 0.0002, "elapsed_sec": 45931.22377490997, "step_time_sec": 8.231001787993591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5538, "loss": 4.053045749664307, "lr": 0.0002, "elapsed_sec": 45939.45477795601, "step_time_sec": 8.230810489010764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5539, "loss": 4.121116638183594, "lr": 0.0002, "elapsed_sec": 45947.685890197754, "step_time_sec": 8.230943497008411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5540, "loss": 4.1188130378723145, "lr": 0.0002, "elapsed_sec": 45955.917746305466, "step_time_sec": 8.23164752998855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5541, "loss": 4.0404767990112305, "lr": 0.0002, "elapsed_sec": 45964.147090911865, "step_time_sec": 8.229190526006278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5542, "loss": 3.8811984062194824, "lr": 0.0002, "elapsed_sec": 45972.374853134155, "step_time_sec": 8.227615262992913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5543, "loss": 4.116417407989502, "lr": 0.0002, "elapsed_sec": 45980.60639643669, "step_time_sec": 8.231321134982863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5544, "loss": 4.089178085327148, "lr": 0.0002, "elapsed_sec": 45988.83649802208, "step_time_sec": 8.229929267981788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5545, "loss": 3.828251838684082, "lr": 0.0002, "elapsed_sec": 45997.06632184982, "step_time_sec": 8.22968222000054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5546, "loss": 4.17227840423584, "lr": 0.0002, "elapsed_sec": 46005.29653453827, "step_time_sec": 8.23004858798231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5547, "loss": 4.030470371246338, "lr": 0.0002, "elapsed_sec": 46013.52841067314, "step_time_sec": 8.231689570995513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13269960192}
{"step": 5548, "loss": 4.053192138671875, "lr": 0.0002, "elapsed_sec": 46022.72255182266, "step_time_sec": 9.194035599997733, "effective_batch_tokens": 232407, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5549, "loss": 3.9956204891204834, "lr": 0.0002, "elapsed_sec": 46030.95094704628, "step_time_sec": 8.22827456399682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5550, "loss": 4.057313919067383, "lr": 0.0002, "elapsed_sec": 46039.182493925095, "step_time_sec": 8.23129733800306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5551, "loss": 4.110785007476807, "lr": 0.0002, "elapsed_sec": 46047.41388607025, "step_time_sec": 8.231237403000705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5552, "loss": 4.071905136108398, "lr": 0.0002, "elapsed_sec": 46055.644659757614, "step_time_sec": 8.230678051972063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5553, "loss": 4.128008842468262, "lr": 0.0002, "elapsed_sec": 46063.87543606758, "step_time_sec": 8.230589941988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5554, "loss": 4.026928424835205, "lr": 0.0002, "elapsed_sec": 46072.106058359146, "step_time_sec": 8.230461717001162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5555, "loss": 3.984066963195801, "lr": 0.0002, "elapsed_sec": 46080.33558797836, "step_time_sec": 8.229367630992783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5556, "loss": 4.093075752258301, "lr": 0.0002, "elapsed_sec": 46088.565181970596, "step_time_sec": 8.229447296005674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5557, "loss": 4.069849967956543, "lr": 0.0002, "elapsed_sec": 46096.796330690384, "step_time_sec": 8.231037407997064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5558, "loss": 4.086678504943848, "lr": 0.0002, "elapsed_sec": 46105.02569150925, "step_time_sec": 8.229131942993263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5559, "loss": 3.990553140640259, "lr": 0.0002, "elapsed_sec": 46113.2547917366, "step_time_sec": 8.228952950012172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5560, "loss": 4.095404148101807, "lr": 0.0002, "elapsed_sec": 46121.48437047005, "step_time_sec": 8.229406191996532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5561, "loss": 4.479536533355713, "lr": 0.0002, "elapsed_sec": 46129.7132267952, "step_time_sec": 8.228757476987084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5562, "loss": 4.024416923522949, "lr": 0.0002, "elapsed_sec": 46137.94154763222, "step_time_sec": 8.228155165008502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5563, "loss": 4.026753902435303, "lr": 0.0002, "elapsed_sec": 46146.17168402672, "step_time_sec": 8.22994461999042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5564, "loss": 4.057743549346924, "lr": 0.0002, "elapsed_sec": 46154.40335583687, "step_time_sec": 8.231551200995455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5565, "loss": 4.0459160804748535, "lr": 0.0002, "elapsed_sec": 46162.63242268562, "step_time_sec": 8.228864945995156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5566, "loss": 3.772669553756714, "lr": 0.0002, "elapsed_sec": 46170.862557172775, "step_time_sec": 8.229969815001823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5567, "loss": 4.064646244049072, "lr": 0.0002, "elapsed_sec": 46179.09416103363, "step_time_sec": 8.231441930984147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5568, "loss": 4.1400532722473145, "lr": 0.0002, "elapsed_sec": 46187.32266497612, "step_time_sec": 8.228334059007466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5569, "loss": 4.034774303436279, "lr": 0.0002, "elapsed_sec": 46195.55144047737, "step_time_sec": 8.22861549598747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5570, "loss": 4.141640663146973, "lr": 0.0002, "elapsed_sec": 46203.77998447418, "step_time_sec": 8.228399095998611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5571, "loss": 4.134809494018555, "lr": 0.0002, "elapsed_sec": 46212.00992822647, "step_time_sec": 8.229812486999435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5572, "loss": 4.055581092834473, "lr": 0.0002, "elapsed_sec": 46220.24131274223, "step_time_sec": 8.231249208009103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5573, "loss": 4.165756702423096, "lr": 0.0002, "elapsed_sec": 46228.470885276794, "step_time_sec": 8.229374408983858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5574, "loss": 4.22141695022583, "lr": 0.0002, "elapsed_sec": 46236.70076370239, "step_time_sec": 8.229712003987515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5575, "loss": 4.095097541809082, "lr": 0.0002, "elapsed_sec": 46244.93196249008, "step_time_sec": 8.231053636001889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5576, "loss": 3.9726343154907227, "lr": 0.0002, "elapsed_sec": 46253.163294792175, "step_time_sec": 8.231198755005607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5577, "loss": 4.254820346832275, "lr": 0.0002, "elapsed_sec": 46261.39428448677, "step_time_sec": 8.23079043100006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5578, "loss": 3.9674601554870605, "lr": 0.0002, "elapsed_sec": 46269.625041246414, "step_time_sec": 8.230608654004754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5579, "loss": 4.137265205383301, "lr": 0.0002, "elapsed_sec": 46277.856041669846, "step_time_sec": 8.230840844014892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5580, "loss": 4.059810161590576, "lr": 0.0002, "elapsed_sec": 46286.08534002304, "step_time_sec": 8.229110090993345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5581, "loss": 4.041741371154785, "lr": 0.0002, "elapsed_sec": 46294.316009283066, "step_time_sec": 8.230554492009105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5582, "loss": 3.9832632541656494, "lr": 0.0002, "elapsed_sec": 46302.54757142067, "step_time_sec": 8.231369373999769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5583, "loss": 4.098837852478027, "lr": 0.0002, "elapsed_sec": 46310.77793312073, "step_time_sec": 8.230218785000034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5584, "loss": 4.221014022827148, "lr": 0.0002, "elapsed_sec": 46319.00840306282, "step_time_sec": 8.230348870012676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5585, "loss": 4.094401836395264, "lr": 0.0002, "elapsed_sec": 46327.23882508278, "step_time_sec": 8.23028051899746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5586, "loss": 4.027716636657715, "lr": 0.0002, "elapsed_sec": 46335.46837735176, "step_time_sec": 8.229417070979252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5587, "loss": 4.079101085662842, "lr": 0.0002, "elapsed_sec": 46343.69837284088, "step_time_sec": 8.22974432099727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5588, "loss": 4.055065631866455, "lr": 0.0002, "elapsed_sec": 46351.926879644394, "step_time_sec": 8.228368029987905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5589, "loss": 3.964200496673584, "lr": 0.0002, "elapsed_sec": 46360.158081531525, "step_time_sec": 8.231045974011067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5590, "loss": 4.075218200683594, "lr": 0.0002, "elapsed_sec": 46368.38885259628, "step_time_sec": 8.230638006993104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5591, "loss": 4.039418697357178, "lr": 0.0002, "elapsed_sec": 46376.61919975281, "step_time_sec": 8.230139839986805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5592, "loss": 4.0152082443237305, "lr": 0.0002, "elapsed_sec": 46384.848397254944, "step_time_sec": 8.22902580499067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5593, "loss": 3.928719997406006, "lr": 0.0002, "elapsed_sec": 46393.07671761513, "step_time_sec": 8.22821616701549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5594, "loss": 4.057676792144775, "lr": 0.0002, "elapsed_sec": 46401.30573630333, "step_time_sec": 8.228868150006747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5595, "loss": 3.9663772583007812, "lr": 0.0002, "elapsed_sec": 46409.53408432007, "step_time_sec": 8.228151980001712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5596, "loss": 4.0645246505737305, "lr": 0.0002, "elapsed_sec": 46417.76289463043, "step_time_sec": 8.228659899992635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5597, "loss": 3.8374476432800293, "lr": 0.0002, "elapsed_sec": 46425.99228620529, "step_time_sec": 8.229208370001288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5598, "loss": 4.210233211517334, "lr": 0.0002, "elapsed_sec": 46434.22138142586, "step_time_sec": 8.228933350008447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5599, "loss": 3.8942618370056152, "lr": 0.0002, "elapsed_sec": 46442.45194721222, "step_time_sec": 8.230416973005049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5600, "loss": 4.120336055755615, "lr": 0.0002, "elapsed_sec": 46450.681839466095, "step_time_sec": 8.22972803999437, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5601, "loss": 4.185028076171875, "lr": 0.0002, "elapsed_sec": 46458.98555493355, "step_time_sec": 8.24551427300321, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5602, "loss": 4.116019248962402, "lr": 0.0002, "elapsed_sec": 46467.215229034424, "step_time_sec": 8.229578076017788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5603, "loss": 4.059484004974365, "lr": 0.0002, "elapsed_sec": 46475.444080114365, "step_time_sec": 8.228602108982159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5604, "loss": 4.05063009262085, "lr": 0.0002, "elapsed_sec": 46483.673698186874, "step_time_sec": 8.22944926499622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5605, "loss": 3.9039275646209717, "lr": 0.0002, "elapsed_sec": 46491.904906988144, "step_time_sec": 8.231048116984311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5606, "loss": 3.966757297515869, "lr": 0.0002, "elapsed_sec": 46500.134237766266, "step_time_sec": 8.229158208006993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5607, "loss": 4.031544208526611, "lr": 0.0002, "elapsed_sec": 46508.364845752716, "step_time_sec": 8.230501810990972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5608, "loss": 4.166508674621582, "lr": 0.0002, "elapsed_sec": 46516.596056222916, "step_time_sec": 8.231030860013561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5609, "loss": 4.159998893737793, "lr": 0.0002, "elapsed_sec": 46524.827013492584, "step_time_sec": 8.23075563000748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5610, "loss": 3.9262173175811768, "lr": 0.0002, "elapsed_sec": 46533.05812835693, "step_time_sec": 8.231010226998478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5611, "loss": 4.003180980682373, "lr": 0.0002, "elapsed_sec": 46541.28916168213, "step_time_sec": 8.230837511015125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5612, "loss": 4.128458023071289, "lr": 0.0002, "elapsed_sec": 46549.52109217644, "step_time_sec": 8.231763109011808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5613, "loss": 3.939514398574829, "lr": 0.0002, "elapsed_sec": 46557.75169086456, "step_time_sec": 8.230459976010025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5614, "loss": 4.289246082305908, "lr": 0.0002, "elapsed_sec": 46565.983046770096, "step_time_sec": 8.231157987000188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5615, "loss": 3.9581358432769775, "lr": 0.0002, "elapsed_sec": 46574.21212673187, "step_time_sec": 8.228899272013223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5616, "loss": 4.140625953674316, "lr": 0.0002, "elapsed_sec": 46582.440638542175, "step_time_sec": 8.22839923101128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5617, "loss": 4.147453308105469, "lr": 0.0002, "elapsed_sec": 46590.67121434212, "step_time_sec": 8.23037520199432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5618, "loss": 4.025333881378174, "lr": 0.0002, "elapsed_sec": 46598.90193295479, "step_time_sec": 8.230570761021227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5619, "loss": 4.139280319213867, "lr": 0.0002, "elapsed_sec": 46607.133783102036, "step_time_sec": 8.23169001401402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5620, "loss": 4.025697231292725, "lr": 0.0002, "elapsed_sec": 46615.36506438255, "step_time_sec": 8.23109907298931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5621, "loss": 3.846088171005249, "lr": 0.0002, "elapsed_sec": 46623.59638547897, "step_time_sec": 8.231140165997203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5622, "loss": 4.09423303604126, "lr": 0.0002, "elapsed_sec": 46631.827306985855, "step_time_sec": 8.230776591983158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5623, "loss": 4.19359016418457, "lr": 0.0002, "elapsed_sec": 46640.05756735802, "step_time_sec": 8.2301004199835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5624, "loss": 4.13751745223999, "lr": 0.0002, "elapsed_sec": 46648.286014556885, "step_time_sec": 8.228362502006348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5625, "loss": 4.119840621948242, "lr": 0.0002, "elapsed_sec": 46656.515595674515, "step_time_sec": 8.22934405101114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5626, "loss": 4.081584930419922, "lr": 0.0002, "elapsed_sec": 46664.74622941017, "step_time_sec": 8.230478669021977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5627, "loss": 3.9091498851776123, "lr": 0.0002, "elapsed_sec": 46672.97695398331, "step_time_sec": 8.230611617997056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5628, "loss": 4.194128036499023, "lr": 0.0002, "elapsed_sec": 46681.206954717636, "step_time_sec": 8.229831320000812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5629, "loss": 4.124537944793701, "lr": 0.0002, "elapsed_sec": 46689.43721199036, "step_time_sec": 8.230092437996063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5630, "loss": 4.061858654022217, "lr": 0.0002, "elapsed_sec": 46697.66842198372, "step_time_sec": 8.231030882976484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5631, "loss": 4.088557243347168, "lr": 0.0002, "elapsed_sec": 46705.899174928665, "step_time_sec": 8.230566893005744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5632, "loss": 4.150774955749512, "lr": 0.0002, "elapsed_sec": 46714.12915420532, "step_time_sec": 8.229845353984274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5633, "loss": 3.915262460708618, "lr": 0.0002, "elapsed_sec": 46722.35871243477, "step_time_sec": 8.229370299988659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5634, "loss": 4.114609718322754, "lr": 0.0002, "elapsed_sec": 46730.58785557747, "step_time_sec": 8.228997391997837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5635, "loss": 4.084519863128662, "lr": 0.0002, "elapsed_sec": 46738.81859874725, "step_time_sec": 8.230572265019873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5636, "loss": 4.156869888305664, "lr": 0.0002, "elapsed_sec": 46747.04975152016, "step_time_sec": 8.231017632002477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5637, "loss": 4.045841693878174, "lr": 0.0002, "elapsed_sec": 46755.28090786934, "step_time_sec": 8.231026970985113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5638, "loss": 4.091902732849121, "lr": 0.0002, "elapsed_sec": 46763.50985598564, "step_time_sec": 8.228771790978499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5639, "loss": 4.088434219360352, "lr": 0.0002, "elapsed_sec": 46771.7382478714, "step_time_sec": 8.228234381007496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5640, "loss": 4.163364887237549, "lr": 0.0002, "elapsed_sec": 46779.9670214653, "step_time_sec": 8.228595313004917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5641, "loss": 4.05147647857666, "lr": 0.0002, "elapsed_sec": 46788.1991622448, "step_time_sec": 8.232025068020448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5642, "loss": 4.064853191375732, "lr": 0.0002, "elapsed_sec": 46796.43090558052, "step_time_sec": 8.231511929014232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5643, "loss": 3.970745325088501, "lr": 0.0002, "elapsed_sec": 46804.661512851715, "step_time_sec": 8.230449237016728, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5644, "loss": 4.176699161529541, "lr": 0.0002, "elapsed_sec": 46812.89222431183, "step_time_sec": 8.230596873007016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5645, "loss": 4.108786106109619, "lr": 0.0002, "elapsed_sec": 46821.1233689785, "step_time_sec": 8.230939036991913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5646, "loss": 4.1282057762146, "lr": 0.0002, "elapsed_sec": 46829.35380792618, "step_time_sec": 8.230286122008692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5647, "loss": 4.131052017211914, "lr": 0.0002, "elapsed_sec": 46837.58517932892, "step_time_sec": 8.231266843999038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5648, "loss": 3.9953486919403076, "lr": 0.0002, "elapsed_sec": 46845.815574645996, "step_time_sec": 8.230176734999986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5649, "loss": 4.139101982116699, "lr": 0.0002, "elapsed_sec": 46854.046075344086, "step_time_sec": 8.230325442011235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5650, "loss": 3.9981374740600586, "lr": 0.0002, "elapsed_sec": 46862.27743887901, "step_time_sec": 8.231202315015253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5651, "loss": 4.110328197479248, "lr": 0.0002, "elapsed_sec": 46870.50574326515, "step_time_sec": 8.228161745006219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5652, "loss": 4.03833532333374, "lr": 0.0002, "elapsed_sec": 46878.73614215851, "step_time_sec": 8.230246753024403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5653, "loss": 4.0819292068481445, "lr": 0.0002, "elapsed_sec": 46886.9665749073, "step_time_sec": 8.230325899989111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5654, "loss": 4.209655284881592, "lr": 0.0002, "elapsed_sec": 46895.19818663597, "step_time_sec": 8.231383094011107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5655, "loss": 4.01708459854126, "lr": 0.0002, "elapsed_sec": 46903.42966461182, "step_time_sec": 8.231379041011678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5656, "loss": 4.015016078948975, "lr": 0.0002, "elapsed_sec": 46911.66079998016, "step_time_sec": 8.230920027999673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5657, "loss": 4.087488174438477, "lr": 0.0002, "elapsed_sec": 46919.891869068146, "step_time_sec": 8.230921865004348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5658, "loss": 4.343107223510742, "lr": 0.0002, "elapsed_sec": 46928.120000600815, "step_time_sec": 8.228023311006837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5659, "loss": 4.112606048583984, "lr": 0.0002, "elapsed_sec": 46936.35190653801, "step_time_sec": 8.231735651002964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5660, "loss": 4.0599775314331055, "lr": 0.0002, "elapsed_sec": 46944.58205938339, "step_time_sec": 8.229938264994416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5661, "loss": 4.188767910003662, "lr": 0.0002, "elapsed_sec": 46952.811559677124, "step_time_sec": 8.229383908997988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5662, "loss": 3.9859089851379395, "lr": 0.0002, "elapsed_sec": 46961.04195737839, "step_time_sec": 8.230202801991254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5663, "loss": 4.010322570800781, "lr": 0.0002, "elapsed_sec": 46969.27210140228, "step_time_sec": 8.229991804983001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5664, "loss": 4.054261684417725, "lr": 0.0002, "elapsed_sec": 46977.50280547142, "step_time_sec": 8.23053049997543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5665, "loss": 4.160508632659912, "lr": 0.0002, "elapsed_sec": 46985.73305726051, "step_time_sec": 8.230115607992047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5666, "loss": 4.075756072998047, "lr": 0.0002, "elapsed_sec": 46993.96223950386, "step_time_sec": 8.229092327004764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5667, "loss": 4.033374786376953, "lr": 0.0002, "elapsed_sec": 47002.19198465347, "step_time_sec": 8.22953786602011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5668, "loss": 4.026778697967529, "lr": 0.0002, "elapsed_sec": 47010.41995096207, "step_time_sec": 8.227774920000229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5669, "loss": 4.09222936630249, "lr": 0.0002, "elapsed_sec": 47018.65104722977, "step_time_sec": 8.231015222001588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5670, "loss": 4.158336162567139, "lr": 0.0002, "elapsed_sec": 47026.88289356232, "step_time_sec": 8.231676915980643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5671, "loss": 4.020303249359131, "lr": 0.0002, "elapsed_sec": 47035.11451172829, "step_time_sec": 8.231394842005102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5672, "loss": 4.0657057762146, "lr": 0.0002, "elapsed_sec": 47043.344250917435, "step_time_sec": 8.229636172996834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5673, "loss": 4.144209384918213, "lr": 0.0002, "elapsed_sec": 47051.57568216324, "step_time_sec": 8.23121018699021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5674, "loss": 4.06658935546875, "lr": 0.0002, "elapsed_sec": 47059.80573296547, "step_time_sec": 8.229926161002368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5675, "loss": 4.057581901550293, "lr": 0.0002, "elapsed_sec": 47068.03455758095, "step_time_sec": 8.22859292398789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5676, "loss": 3.8941168785095215, "lr": 0.0002, "elapsed_sec": 47076.26288986206, "step_time_sec": 8.228257593000308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5677, "loss": 4.297502040863037, "lr": 0.0002, "elapsed_sec": 47084.493602752686, "step_time_sec": 8.230516087001888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5678, "loss": 4.120710849761963, "lr": 0.0002, "elapsed_sec": 47092.725237607956, "step_time_sec": 8.231448029022431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5679, "loss": 4.11061954498291, "lr": 0.0002, "elapsed_sec": 47100.95624303818, "step_time_sec": 8.230849473999115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5680, "loss": 3.939704418182373, "lr": 0.0002, "elapsed_sec": 47109.18725895882, "step_time_sec": 8.230840400996385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5681, "loss": 4.090179443359375, "lr": 0.0002, "elapsed_sec": 47117.418199539185, "step_time_sec": 8.230763851985103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5682, "loss": 4.160719394683838, "lr": 0.0002, "elapsed_sec": 47125.649559020996, "step_time_sec": 8.231222737987991, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5683, "loss": 3.9158220291137695, "lr": 0.0002, "elapsed_sec": 47133.88121294975, "step_time_sec": 8.231540354987374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5684, "loss": 4.006404876708984, "lr": 0.0002, "elapsed_sec": 47142.1119556427, "step_time_sec": 8.230527328996686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5685, "loss": 4.059987545013428, "lr": 0.0002, "elapsed_sec": 47150.34432768822, "step_time_sec": 8.23220643599052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5686, "loss": 4.03359842300415, "lr": 0.0002, "elapsed_sec": 47158.575090408325, "step_time_sec": 8.230698408995522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5687, "loss": 3.980417013168335, "lr": 0.0002, "elapsed_sec": 47166.80679488182, "step_time_sec": 8.231455873989034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5688, "loss": 4.139793395996094, "lr": 0.0002, "elapsed_sec": 47175.036836862564, "step_time_sec": 8.229927239008248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5689, "loss": 4.172679424285889, "lr": 0.0002, "elapsed_sec": 47183.266626119614, "step_time_sec": 8.229580432991497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5690, "loss": 4.1665167808532715, "lr": 0.0002, "elapsed_sec": 47191.49698162079, "step_time_sec": 8.230179945996497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5691, "loss": 4.044037818908691, "lr": 0.0002, "elapsed_sec": 47199.72768974304, "step_time_sec": 8.230541060998803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5692, "loss": 4.0800461769104, "lr": 0.0002, "elapsed_sec": 47207.956733226776, "step_time_sec": 8.228866984980414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5693, "loss": 4.001661777496338, "lr": 0.0002, "elapsed_sec": 47216.18581843376, "step_time_sec": 8.228921748988796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5694, "loss": 4.081512451171875, "lr": 0.0002, "elapsed_sec": 47224.41354894638, "step_time_sec": 8.22758620098466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5695, "loss": 4.038759708404541, "lr": 0.0002, "elapsed_sec": 47232.64414381981, "step_time_sec": 8.230467298999429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5696, "loss": 4.169353485107422, "lr": 0.0002, "elapsed_sec": 47240.87541770935, "step_time_sec": 8.231052019982599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5697, "loss": 4.076623439788818, "lr": 0.0002, "elapsed_sec": 47249.105676651, "step_time_sec": 8.230140687985113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5698, "loss": 4.051965713500977, "lr": 0.0002, "elapsed_sec": 47257.335940122604, "step_time_sec": 8.230061542999465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5699, "loss": 4.075285911560059, "lr": 0.0002, "elapsed_sec": 47265.56700181961, "step_time_sec": 8.230925365001895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5700, "loss": 4.041077136993408, "lr": 0.0002, "elapsed_sec": 47273.79725813866, "step_time_sec": 8.230119660001947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5701, "loss": 3.9070003032684326, "lr": 0.0002, "elapsed_sec": 47282.0277671814, "step_time_sec": 8.230281648022356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5702, "loss": 4.126590728759766, "lr": 0.0002, "elapsed_sec": 47290.257962703705, "step_time_sec": 8.230028047983069, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5703, "loss": 3.9831020832061768, "lr": 0.0002, "elapsed_sec": 47298.487595796585, "step_time_sec": 8.229476220003562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5704, "loss": 4.078746795654297, "lr": 0.0002, "elapsed_sec": 47306.71797609329, "step_time_sec": 8.230227018008009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5705, "loss": 4.250119209289551, "lr": 0.0002, "elapsed_sec": 47314.94771313667, "step_time_sec": 8.2295750429912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5706, "loss": 4.178350448608398, "lr": 0.0002, "elapsed_sec": 47323.178488731384, "step_time_sec": 8.230633650993695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5707, "loss": 4.133512496948242, "lr": 0.0002, "elapsed_sec": 47331.4097571373, "step_time_sec": 8.231087993975962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5708, "loss": 4.172065734863281, "lr": 0.0002, "elapsed_sec": 47339.64037322998, "step_time_sec": 8.230475360993296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5709, "loss": 4.109809875488281, "lr": 0.0002, "elapsed_sec": 47347.87059736252, "step_time_sec": 8.230107559007592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5710, "loss": 4.096749305725098, "lr": 0.0002, "elapsed_sec": 47356.1003742218, "step_time_sec": 8.229558477993123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5711, "loss": 3.9885990619659424, "lr": 0.0002, "elapsed_sec": 47364.332221746445, "step_time_sec": 8.231682823010487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5712, "loss": 4.025777339935303, "lr": 0.0002, "elapsed_sec": 47372.56181764603, "step_time_sec": 8.229440827999497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5713, "loss": 4.065154552459717, "lr": 0.0002, "elapsed_sec": 47380.79255056381, "step_time_sec": 8.230591172992717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5714, "loss": 3.9178731441497803, "lr": 0.0002, "elapsed_sec": 47389.02281451225, "step_time_sec": 8.230138907994842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5715, "loss": 4.109623432159424, "lr": 0.0002, "elapsed_sec": 47397.254096746445, "step_time_sec": 8.231165987002896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5716, "loss": 4.071839809417725, "lr": 0.0002, "elapsed_sec": 47405.48420596123, "step_time_sec": 8.229883811000036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5717, "loss": 4.076505184173584, "lr": 0.0002, "elapsed_sec": 47413.712540864944, "step_time_sec": 8.228234065987635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5718, "loss": 3.9931509494781494, "lr": 0.0002, "elapsed_sec": 47421.94125652313, "step_time_sec": 8.228509422013303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5719, "loss": 4.036441326141357, "lr": 0.0002, "elapsed_sec": 47430.1712038517, "step_time_sec": 8.230566658981843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5720, "loss": 3.9507198333740234, "lr": 0.0002, "elapsed_sec": 47438.40138196945, "step_time_sec": 8.229250481002964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5721, "loss": 4.041420936584473, "lr": 0.0002, "elapsed_sec": 47446.63008189201, "step_time_sec": 8.22853422301705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5722, "loss": 3.990748405456543, "lr": 0.0002, "elapsed_sec": 47454.86106777191, "step_time_sec": 8.230871210020268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5723, "loss": 4.060542106628418, "lr": 0.0002, "elapsed_sec": 47463.09309744835, "step_time_sec": 8.231828650983516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5724, "loss": 4.020864486694336, "lr": 0.0002, "elapsed_sec": 47471.32209610939, "step_time_sec": 8.228797402989585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5725, "loss": 3.874727249145508, "lr": 0.0002, "elapsed_sec": 47479.551580905914, "step_time_sec": 8.229330218018731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5726, "loss": 4.022449016571045, "lr": 0.0002, "elapsed_sec": 47487.78241443634, "step_time_sec": 8.230648241005838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5727, "loss": 4.046261787414551, "lr": 0.0002, "elapsed_sec": 47496.01326107979, "step_time_sec": 8.230730459006736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5728, "loss": 3.9288012981414795, "lr": 0.0002, "elapsed_sec": 47504.24445152283, "step_time_sec": 8.231050319998758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5729, "loss": 3.927400827407837, "lr": 0.0002, "elapsed_sec": 47512.47445845604, "step_time_sec": 8.229811110999435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5730, "loss": 4.129666328430176, "lr": 0.0002, "elapsed_sec": 47520.70408630371, "step_time_sec": 8.229472919018008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5731, "loss": 4.018298149108887, "lr": 0.0002, "elapsed_sec": 47528.93432354927, "step_time_sec": 8.230162774998462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5732, "loss": 3.938215732574463, "lr": 0.0002, "elapsed_sec": 47537.16554284096, "step_time_sec": 8.230999561987119, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5733, "loss": 4.0617828369140625, "lr": 0.0002, "elapsed_sec": 47545.39627671242, "step_time_sec": 8.230579313996714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5734, "loss": 4.0575056076049805, "lr": 0.0002, "elapsed_sec": 47553.627104759216, "step_time_sec": 8.23071491101291, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5735, "loss": 4.063488006591797, "lr": 0.0002, "elapsed_sec": 47561.8576900959, "step_time_sec": 8.230376651976258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5736, "loss": 3.9344284534454346, "lr": 0.0002, "elapsed_sec": 47570.08812880516, "step_time_sec": 8.230332054023165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5737, "loss": 3.877103328704834, "lr": 0.0002, "elapsed_sec": 47578.317145347595, "step_time_sec": 8.228903985989746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5738, "loss": 3.9939632415771484, "lr": 0.0002, "elapsed_sec": 47586.547609090805, "step_time_sec": 8.230233230016893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5739, "loss": 4.035184383392334, "lr": 0.0002, "elapsed_sec": 47594.77791953087, "step_time_sec": 8.230137984006433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5740, "loss": 4.039819717407227, "lr": 0.0002, "elapsed_sec": 47603.009021282196, "step_time_sec": 8.230935633997433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5741, "loss": 3.9496161937713623, "lr": 0.0002, "elapsed_sec": 47611.23772573471, "step_time_sec": 8.228581239993218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5742, "loss": 4.067815780639648, "lr": 0.0002, "elapsed_sec": 47619.468968868256, "step_time_sec": 8.231105676997686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5743, "loss": 4.024702072143555, "lr": 0.0002, "elapsed_sec": 47627.69960618019, "step_time_sec": 8.230417466984363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5744, "loss": 4.180920124053955, "lr": 0.0002, "elapsed_sec": 47635.92883205414, "step_time_sec": 8.229074527014745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5745, "loss": 4.121452808380127, "lr": 0.0002, "elapsed_sec": 47644.157401800156, "step_time_sec": 8.228477042983286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5746, "loss": 4.01693868637085, "lr": 0.0002, "elapsed_sec": 47652.38489866257, "step_time_sec": 8.227327982982388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5747, "loss": 3.995736837387085, "lr": 0.0002, "elapsed_sec": 47660.61636567116, "step_time_sec": 8.231291870994028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5748, "loss": 3.9720232486724854, "lr": 0.0002, "elapsed_sec": 47668.847163915634, "step_time_sec": 8.23066785201081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5749, "loss": 4.111087322235107, "lr": 0.0002, "elapsed_sec": 47677.07702445984, "step_time_sec": 8.229716793983243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5750, "loss": 3.984107732772827, "lr": 0.0002, "elapsed_sec": 47685.30517578125, "step_time_sec": 8.227973544999259, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5751, "loss": 3.9549310207366943, "lr": 0.0002, "elapsed_sec": 47693.53346800804, "step_time_sec": 8.22809384498396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5752, "loss": 4.004446029663086, "lr": 0.0002, "elapsed_sec": 47701.764487981796, "step_time_sec": 8.230865346995415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5753, "loss": 3.89933443069458, "lr": 0.0002, "elapsed_sec": 47709.99490213394, "step_time_sec": 8.230280996998772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5754, "loss": 4.173211097717285, "lr": 0.0002, "elapsed_sec": 47718.22409534454, "step_time_sec": 8.229021311010001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5755, "loss": 4.015073299407959, "lr": 0.0002, "elapsed_sec": 47726.45298600197, "step_time_sec": 8.228694224002538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5756, "loss": 3.9425899982452393, "lr": 0.0002, "elapsed_sec": 47734.681169748306, "step_time_sec": 8.228029020014219, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5757, "loss": 3.9514424800872803, "lr": 0.0002, "elapsed_sec": 47742.91072058678, "step_time_sec": 8.229381947981892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5758, "loss": 4.168205738067627, "lr": 0.0002, "elapsed_sec": 47751.14084005356, "step_time_sec": 8.229974468005821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5759, "loss": 4.108918190002441, "lr": 0.0002, "elapsed_sec": 47759.36967110634, "step_time_sec": 8.228737180994358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5760, "loss": 4.061925411224365, "lr": 0.0002, "elapsed_sec": 47767.598563194275, "step_time_sec": 8.228684859001078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5761, "loss": 4.062878608703613, "lr": 0.0002, "elapsed_sec": 47775.829359292984, "step_time_sec": 8.230636416003108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5762, "loss": 4.183744430541992, "lr": 0.0002, "elapsed_sec": 47784.06027197838, "step_time_sec": 8.23082668101415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5763, "loss": 4.04636812210083, "lr": 0.0002, "elapsed_sec": 47792.29016375542, "step_time_sec": 8.229682642006082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5764, "loss": 4.10865592956543, "lr": 0.0002, "elapsed_sec": 47800.52211308479, "step_time_sec": 8.231794611987425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5765, "loss": 4.043957233428955, "lr": 0.0002, "elapsed_sec": 47808.75444030762, "step_time_sec": 8.232158193015493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5766, "loss": 4.146218776702881, "lr": 0.0002, "elapsed_sec": 47816.984748363495, "step_time_sec": 8.230164016014896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5767, "loss": 4.014991283416748, "lr": 0.0002, "elapsed_sec": 47825.214668512344, "step_time_sec": 8.229762366012437, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5768, "loss": 4.025537014007568, "lr": 0.0002, "elapsed_sec": 47833.4433195591, "step_time_sec": 8.22852042800514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5769, "loss": 4.183014392852783, "lr": 0.0002, "elapsed_sec": 47841.67323040962, "step_time_sec": 8.229798599990318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5770, "loss": 4.012434959411621, "lr": 0.0002, "elapsed_sec": 47849.903616666794, "step_time_sec": 8.230184124986408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5771, "loss": 4.084991931915283, "lr": 0.0002, "elapsed_sec": 47858.13452386856, "step_time_sec": 8.230710345989792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5772, "loss": 4.168278694152832, "lr": 0.0002, "elapsed_sec": 47866.36516666412, "step_time_sec": 8.230505123006878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5773, "loss": 4.066329002380371, "lr": 0.0002, "elapsed_sec": 47874.59647679329, "step_time_sec": 8.231195972010028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5774, "loss": 4.033202648162842, "lr": 0.0002, "elapsed_sec": 47882.827677726746, "step_time_sec": 8.230979581014253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5775, "loss": 4.067209720611572, "lr": 0.0002, "elapsed_sec": 47891.05830168724, "step_time_sec": 8.230519899982028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5776, "loss": 4.056988716125488, "lr": 0.0002, "elapsed_sec": 47899.290113687515, "step_time_sec": 8.231624476000434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5777, "loss": 4.029590129852295, "lr": 0.0002, "elapsed_sec": 47907.52059483528, "step_time_sec": 8.230319683993002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5778, "loss": 4.01989221572876, "lr": 0.0002, "elapsed_sec": 47915.74943947792, "step_time_sec": 8.228670230979333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5779, "loss": 4.056484699249268, "lr": 0.0002, "elapsed_sec": 47923.978466272354, "step_time_sec": 8.228924811002798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5780, "loss": 3.906369209289551, "lr": 0.0002, "elapsed_sec": 47932.20706033707, "step_time_sec": 8.22839089500485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5781, "loss": 4.085692405700684, "lr": 0.0002, "elapsed_sec": 47940.43540287018, "step_time_sec": 8.22818919600104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5782, "loss": 3.9604849815368652, "lr": 0.0002, "elapsed_sec": 47948.66326189041, "step_time_sec": 8.227699923998443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5783, "loss": 4.1523823738098145, "lr": 0.0002, "elapsed_sec": 47956.89400053024, "step_time_sec": 8.230604697979288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5784, "loss": 3.959969997406006, "lr": 0.0002, "elapsed_sec": 47965.12432026863, "step_time_sec": 8.230094790022122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5785, "loss": 4.134875774383545, "lr": 0.0002, "elapsed_sec": 47973.3542406559, "step_time_sec": 8.229835104983067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5786, "loss": 3.871476888656616, "lr": 0.0002, "elapsed_sec": 47981.58342552185, "step_time_sec": 8.228980247018626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5787, "loss": 4.031925678253174, "lr": 0.0002, "elapsed_sec": 47989.814426898956, "step_time_sec": 8.230897244997323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5788, "loss": 4.196314334869385, "lr": 0.0002, "elapsed_sec": 47998.04549527168, "step_time_sec": 8.23087193799438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5789, "loss": 4.0300421714782715, "lr": 0.0002, "elapsed_sec": 48006.27702283859, "step_time_sec": 8.231356773991138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5790, "loss": 3.9510269165039062, "lr": 0.0002, "elapsed_sec": 48014.50786232948, "step_time_sec": 8.23073365399614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5791, "loss": 4.057998180389404, "lr": 0.0002, "elapsed_sec": 48022.73741006851, "step_time_sec": 8.229344105988275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5792, "loss": 4.229516506195068, "lr": 0.0002, "elapsed_sec": 48030.96564078331, "step_time_sec": 8.228130189992953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5793, "loss": 4.024885654449463, "lr": 0.0002, "elapsed_sec": 48039.19532108307, "step_time_sec": 8.229492535989266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5794, "loss": 4.123287677764893, "lr": 0.0002, "elapsed_sec": 48047.42441368103, "step_time_sec": 8.228902100992855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5795, "loss": 4.033829212188721, "lr": 0.0002, "elapsed_sec": 48055.65576672554, "step_time_sec": 8.231206171010854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5796, "loss": 3.9813015460968018, "lr": 0.0002, "elapsed_sec": 48063.88657069206, "step_time_sec": 8.23065377099556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5797, "loss": 4.061415195465088, "lr": 0.0002, "elapsed_sec": 48072.11808037758, "step_time_sec": 8.231340224010637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5798, "loss": 3.9133236408233643, "lr": 0.0002, "elapsed_sec": 48080.34668326378, "step_time_sec": 8.228439238999272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5799, "loss": 4.011948585510254, "lr": 0.0002, "elapsed_sec": 48088.57406973839, "step_time_sec": 8.227245576999849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5800, "loss": 3.930088758468628, "lr": 0.0002, "elapsed_sec": 48096.802782297134, "step_time_sec": 8.228558690985665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5801, "loss": 4.063460826873779, "lr": 0.0002, "elapsed_sec": 48105.03196787834, "step_time_sec": 8.229096739989473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5802, "loss": 3.921696901321411, "lr": 0.0002, "elapsed_sec": 48113.25993824005, "step_time_sec": 8.227739527006634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5803, "loss": 3.9812936782836914, "lr": 0.0002, "elapsed_sec": 48121.490502119064, "step_time_sec": 8.230476332013495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5804, "loss": 3.9132492542266846, "lr": 0.0002, "elapsed_sec": 48129.71919941902, "step_time_sec": 8.228525029000593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5805, "loss": 3.9725098609924316, "lr": 0.0002, "elapsed_sec": 48137.94783806801, "step_time_sec": 8.228452166018542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5806, "loss": 4.0284833908081055, "lr": 0.0002, "elapsed_sec": 48146.17843270302, "step_time_sec": 8.23042344598798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5807, "loss": 3.993593692779541, "lr": 0.0002, "elapsed_sec": 48154.4087870121, "step_time_sec": 8.230250277993036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5808, "loss": 3.973576784133911, "lr": 0.0002, "elapsed_sec": 48162.637278556824, "step_time_sec": 8.228307152021443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5809, "loss": 4.034078598022461, "lr": 0.0002, "elapsed_sec": 48170.86693096161, "step_time_sec": 8.229505431983853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5810, "loss": 4.22401762008667, "lr": 0.0002, "elapsed_sec": 48179.09635210037, "step_time_sec": 8.229263485991396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5811, "loss": 4.069217681884766, "lr": 0.0002, "elapsed_sec": 48187.326941251755, "step_time_sec": 8.23043953301385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5812, "loss": 3.972231149673462, "lr": 0.0002, "elapsed_sec": 48195.55815768242, "step_time_sec": 8.231133280001814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5813, "loss": 4.020587921142578, "lr": 0.0002, "elapsed_sec": 48203.78884887695, "step_time_sec": 8.230465264990926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5814, "loss": 4.1397528648376465, "lr": 0.0002, "elapsed_sec": 48212.0187830925, "step_time_sec": 8.229778836015612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5815, "loss": 4.232081413269043, "lr": 0.0002, "elapsed_sec": 48220.25037050247, "step_time_sec": 8.231425917998422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5816, "loss": 4.05958366394043, "lr": 0.0002, "elapsed_sec": 48228.48078894615, "step_time_sec": 8.230315363005502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5817, "loss": 4.117853164672852, "lr": 0.0002, "elapsed_sec": 48236.712164878845, "step_time_sec": 8.231146567995893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5818, "loss": 4.112892150878906, "lr": 0.0002, "elapsed_sec": 48244.94296050072, "step_time_sec": 8.230680773005588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5819, "loss": 4.092519283294678, "lr": 0.0002, "elapsed_sec": 48253.17251253128, "step_time_sec": 8.229367007996188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5820, "loss": 3.944852828979492, "lr": 0.0002, "elapsed_sec": 48261.40153670311, "step_time_sec": 8.228858153976034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5821, "loss": 3.9246599674224854, "lr": 0.0002, "elapsed_sec": 48269.63028049469, "step_time_sec": 8.228656664985465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5822, "loss": 4.03304386138916, "lr": 0.0002, "elapsed_sec": 48277.86231255531, "step_time_sec": 8.23186528502265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5823, "loss": 3.9927854537963867, "lr": 0.0002, "elapsed_sec": 48286.093361616135, "step_time_sec": 8.230853585002478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5824, "loss": 4.0094428062438965, "lr": 0.0002, "elapsed_sec": 48294.324427604675, "step_time_sec": 8.23095093597658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5825, "loss": 4.0110182762146, "lr": 0.0002, "elapsed_sec": 48302.555572748184, "step_time_sec": 8.230932475009467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5826, "loss": 4.064759731292725, "lr": 0.0002, "elapsed_sec": 48310.78582119942, "step_time_sec": 8.230103505018633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5827, "loss": 4.070398330688477, "lr": 0.0002, "elapsed_sec": 48319.01734614372, "step_time_sec": 8.231378933996893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5828, "loss": 3.9978926181793213, "lr": 0.0002, "elapsed_sec": 48327.24890422821, "step_time_sec": 8.231410709995544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5829, "loss": 4.008375644683838, "lr": 0.0002, "elapsed_sec": 48335.47881603241, "step_time_sec": 8.229777492000721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5830, "loss": 3.897425651550293, "lr": 0.0002, "elapsed_sec": 48343.70866727829, "step_time_sec": 8.229644706007093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5831, "loss": 4.111202716827393, "lr": 0.0002, "elapsed_sec": 48351.93873858452, "step_time_sec": 8.229945165017853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5832, "loss": 3.994337320327759, "lr": 0.0002, "elapsed_sec": 48360.169016599655, "step_time_sec": 8.23016227898188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5833, "loss": 4.1271772384643555, "lr": 0.0002, "elapsed_sec": 48368.39950847626, "step_time_sec": 8.23030439100694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5834, "loss": 3.7437548637390137, "lr": 0.0002, "elapsed_sec": 48376.63122224808, "step_time_sec": 8.231593033997342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5835, "loss": 3.92572283744812, "lr": 0.0002, "elapsed_sec": 48384.86138343811, "step_time_sec": 8.229944541002624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5836, "loss": 4.030357360839844, "lr": 0.0002, "elapsed_sec": 48393.090453863144, "step_time_sec": 8.22892698700889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5837, "loss": 4.021367073059082, "lr": 0.0002, "elapsed_sec": 48401.31996369362, "step_time_sec": 8.229357290983899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5838, "loss": 3.901158571243286, "lr": 0.0002, "elapsed_sec": 48409.55214595795, "step_time_sec": 8.232071531005204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5839, "loss": 3.9320926666259766, "lr": 0.0002, "elapsed_sec": 48417.78262734413, "step_time_sec": 8.23028043899103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5840, "loss": 3.9690728187561035, "lr": 0.0002, "elapsed_sec": 48426.013409137726, "step_time_sec": 8.23064903501654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5841, "loss": 3.8664419651031494, "lr": 0.0002, "elapsed_sec": 48434.24433302879, "step_time_sec": 8.23076170499553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5842, "loss": 3.9616036415100098, "lr": 0.0002, "elapsed_sec": 48442.47485637665, "step_time_sec": 8.230385533010121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5843, "loss": 4.084275722503662, "lr": 0.0002, "elapsed_sec": 48450.705562114716, "step_time_sec": 8.23053351798444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5844, "loss": 3.954878330230713, "lr": 0.0002, "elapsed_sec": 48458.93548965454, "step_time_sec": 8.229782344016712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5845, "loss": 4.102745056152344, "lr": 0.0002, "elapsed_sec": 48467.16673898697, "step_time_sec": 8.231117477000225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5846, "loss": 3.9642093181610107, "lr": 0.0002, "elapsed_sec": 48475.397329092026, "step_time_sec": 8.230456858989783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5847, "loss": 4.069472789764404, "lr": 0.0002, "elapsed_sec": 48483.627434015274, "step_time_sec": 8.229905448999489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5848, "loss": 3.9578843116760254, "lr": 0.0002, "elapsed_sec": 48491.85738658905, "step_time_sec": 8.229809483018471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5849, "loss": 4.061724662780762, "lr": 0.0002, "elapsed_sec": 48500.08790373802, "step_time_sec": 8.230399259016849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5850, "loss": 4.035805702209473, "lr": 0.0002, "elapsed_sec": 48508.318878650665, "step_time_sec": 8.230791841982864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5851, "loss": 3.9827089309692383, "lr": 0.0002, "elapsed_sec": 48516.5504026413, "step_time_sec": 8.231366273015738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5852, "loss": 4.0666351318359375, "lr": 0.0002, "elapsed_sec": 48524.78103685379, "step_time_sec": 8.230503433995182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5853, "loss": 4.061349868774414, "lr": 0.0002, "elapsed_sec": 48533.01190018654, "step_time_sec": 8.230664923990844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5854, "loss": 3.779576301574707, "lr": 0.0002, "elapsed_sec": 48541.242507219315, "step_time_sec": 8.230478618002962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5855, "loss": 4.080240726470947, "lr": 0.0002, "elapsed_sec": 48549.47099137306, "step_time_sec": 8.228352911013644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5856, "loss": 3.9810428619384766, "lr": 0.0002, "elapsed_sec": 48557.70141291618, "step_time_sec": 8.230203774000984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5857, "loss": 4.176366329193115, "lr": 0.0002, "elapsed_sec": 48565.93020606041, "step_time_sec": 8.228691341995727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5858, "loss": 3.996509075164795, "lr": 0.0002, "elapsed_sec": 48574.15883398056, "step_time_sec": 8.228478761011502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5859, "loss": 4.0038347244262695, "lr": 0.0002, "elapsed_sec": 48582.38618803024, "step_time_sec": 8.227150771999732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5860, "loss": 4.0304083824157715, "lr": 0.0002, "elapsed_sec": 48590.61357212067, "step_time_sec": 8.227263345004758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5861, "loss": 3.9658806324005127, "lr": 0.0002, "elapsed_sec": 48598.842960357666, "step_time_sec": 8.229186773009133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5862, "loss": 3.931267738342285, "lr": 0.0002, "elapsed_sec": 48607.07340717316, "step_time_sec": 8.230316796980333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5863, "loss": 4.1305623054504395, "lr": 0.0002, "elapsed_sec": 48615.30408143997, "step_time_sec": 8.230580183007987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5864, "loss": 4.145278453826904, "lr": 0.0002, "elapsed_sec": 48623.53372144699, "step_time_sec": 8.229395283997292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5865, "loss": 3.9933199882507324, "lr": 0.0002, "elapsed_sec": 48631.76312112808, "step_time_sec": 8.229238165018614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5866, "loss": 4.089040279388428, "lr": 0.0002, "elapsed_sec": 48639.99251270294, "step_time_sec": 8.229259204003029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5867, "loss": 3.927889347076416, "lr": 0.0002, "elapsed_sec": 48648.22287774086, "step_time_sec": 8.230233853013488, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5868, "loss": 4.03428316116333, "lr": 0.0002, "elapsed_sec": 48656.4535112381, "step_time_sec": 8.23044178498094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5869, "loss": 4.08319091796875, "lr": 0.0002, "elapsed_sec": 48664.68414092064, "step_time_sec": 8.230509888991946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5870, "loss": 4.038322925567627, "lr": 0.0002, "elapsed_sec": 48672.9160323143, "step_time_sec": 8.231681642995682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5871, "loss": 4.003343105316162, "lr": 0.0002, "elapsed_sec": 48681.146089315414, "step_time_sec": 8.229884638014482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5872, "loss": 4.04691743850708, "lr": 0.0002, "elapsed_sec": 48689.37519264221, "step_time_sec": 8.228986237983918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5873, "loss": 3.9259448051452637, "lr": 0.0002, "elapsed_sec": 48697.603800058365, "step_time_sec": 8.228439414000604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5874, "loss": 4.018320560455322, "lr": 0.0002, "elapsed_sec": 48705.834451675415, "step_time_sec": 8.230510519002564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5875, "loss": 4.122435569763184, "lr": 0.0002, "elapsed_sec": 48714.06449651718, "step_time_sec": 8.229899262980325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5876, "loss": 3.9625725746154785, "lr": 0.0002, "elapsed_sec": 48722.29258656502, "step_time_sec": 8.227934631024254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5877, "loss": 4.046468257904053, "lr": 0.0002, "elapsed_sec": 48730.52412080765, "step_time_sec": 8.231432336004218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5878, "loss": 4.069897174835205, "lr": 0.0002, "elapsed_sec": 48738.75448131561, "step_time_sec": 8.230159854021622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5879, "loss": 4.177279472351074, "lr": 0.0002, "elapsed_sec": 48746.98550057411, "step_time_sec": 8.230842120014131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5880, "loss": 3.872575283050537, "lr": 0.0002, "elapsed_sec": 48755.21677279472, "step_time_sec": 8.231125321006402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5881, "loss": 3.868014335632324, "lr": 0.0002, "elapsed_sec": 48763.44734573364, "step_time_sec": 8.23047053598566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5882, "loss": 4.134852886199951, "lr": 0.0002, "elapsed_sec": 48771.67875289917, "step_time_sec": 8.231204176991014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5883, "loss": 4.1135077476501465, "lr": 0.0002, "elapsed_sec": 48779.90768647194, "step_time_sec": 8.228837636997923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5884, "loss": 4.087094306945801, "lr": 0.0002, "elapsed_sec": 48788.137622356415, "step_time_sec": 8.229818304011133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5885, "loss": 3.8542909622192383, "lr": 0.0002, "elapsed_sec": 48796.36624073982, "step_time_sec": 8.228422665997641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5886, "loss": 4.039510250091553, "lr": 0.0002, "elapsed_sec": 48804.59261012077, "step_time_sec": 8.226159781013848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5887, "loss": 4.123979091644287, "lr": 0.0002, "elapsed_sec": 48812.82253956795, "step_time_sec": 8.229796576983063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5888, "loss": 4.118004322052002, "lr": 0.0002, "elapsed_sec": 48821.052685022354, "step_time_sec": 8.2299576770165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5889, "loss": 4.113828182220459, "lr": 0.0002, "elapsed_sec": 48829.28312039375, "step_time_sec": 8.230260778014781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5890, "loss": 4.041684150695801, "lr": 0.0002, "elapsed_sec": 48837.51431250572, "step_time_sec": 8.231053764000535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5891, "loss": 3.81937575340271, "lr": 0.0002, "elapsed_sec": 48845.744878292084, "step_time_sec": 8.23045284699765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5892, "loss": 4.1212568283081055, "lr": 0.0002, "elapsed_sec": 48853.974034547806, "step_time_sec": 8.229025566019118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5893, "loss": 4.069974899291992, "lr": 0.0002, "elapsed_sec": 48862.20297908783, "step_time_sec": 8.22871840200969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5894, "loss": 4.189481735229492, "lr": 0.0002, "elapsed_sec": 48870.43055009842, "step_time_sec": 8.227425601013238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5895, "loss": 4.047233581542969, "lr": 0.0002, "elapsed_sec": 48878.6609711647, "step_time_sec": 8.230259288015077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5896, "loss": 4.0059733390808105, "lr": 0.0002, "elapsed_sec": 48886.891228437424, "step_time_sec": 8.230168616981246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5897, "loss": 4.046464443206787, "lr": 0.0002, "elapsed_sec": 48895.12194442749, "step_time_sec": 8.230533638998168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5898, "loss": 4.0166425704956055, "lr": 0.0002, "elapsed_sec": 48903.35148525238, "step_time_sec": 8.22940747498069, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5899, "loss": 3.9409234523773193, "lr": 0.0002, "elapsed_sec": 48911.58023047447, "step_time_sec": 8.228530455002328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5900, "loss": 3.9396419525146484, "lr": 0.0002, "elapsed_sec": 48919.81093144417, "step_time_sec": 8.230591791012557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5901, "loss": 4.124947547912598, "lr": 0.0002, "elapsed_sec": 48928.04130792618, "step_time_sec": 8.23016586599988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5902, "loss": 4.0157246589660645, "lr": 0.0002, "elapsed_sec": 48936.27279305458, "step_time_sec": 8.23134333800408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5903, "loss": 4.061605453491211, "lr": 0.0002, "elapsed_sec": 48944.50252199173, "step_time_sec": 8.229592910996871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5904, "loss": 4.153736114501953, "lr": 0.0002, "elapsed_sec": 48952.734164237976, "step_time_sec": 8.231491417012876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5905, "loss": 3.991347551345825, "lr": 0.0002, "elapsed_sec": 48960.96610569954, "step_time_sec": 8.231828991003567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5906, "loss": 4.1789631843566895, "lr": 0.0002, "elapsed_sec": 48969.196580410004, "step_time_sec": 8.230275551992236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5907, "loss": 4.142834186553955, "lr": 0.0002, "elapsed_sec": 48977.42570900917, "step_time_sec": 8.229020874015987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5908, "loss": 4.041234016418457, "lr": 0.0002, "elapsed_sec": 48985.655151844025, "step_time_sec": 8.229241146997083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5909, "loss": 3.8919639587402344, "lr": 0.0002, "elapsed_sec": 48993.88589811325, "step_time_sec": 8.230632403981872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5910, "loss": 3.898836135864258, "lr": 0.0002, "elapsed_sec": 49002.116337776184, "step_time_sec": 8.230245126003865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5911, "loss": 4.053189754486084, "lr": 0.0002, "elapsed_sec": 49010.34889340401, "step_time_sec": 8.23239244800061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5912, "loss": 4.087409496307373, "lr": 0.0002, "elapsed_sec": 49018.579139471054, "step_time_sec": 8.230168196983868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5913, "loss": 3.962080955505371, "lr": 0.0002, "elapsed_sec": 49026.81009864807, "step_time_sec": 8.23073145901435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5914, "loss": 4.10127592086792, "lr": 0.0002, "elapsed_sec": 49035.039850473404, "step_time_sec": 8.229672168003162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5915, "loss": 4.001861572265625, "lr": 0.0002, "elapsed_sec": 49043.26854920387, "step_time_sec": 8.228456102981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5916, "loss": 3.8548309803009033, "lr": 0.0002, "elapsed_sec": 49051.49895238876, "step_time_sec": 8.23027282799012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5917, "loss": 4.173124313354492, "lr": 0.0002, "elapsed_sec": 49059.72909116745, "step_time_sec": 8.229959258984309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5918, "loss": 3.9571120738983154, "lr": 0.0002, "elapsed_sec": 49067.95819211006, "step_time_sec": 8.22893925398239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5919, "loss": 3.8916518688201904, "lr": 0.0002, "elapsed_sec": 49076.18786764145, "step_time_sec": 8.22955433797324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5920, "loss": 4.1448845863342285, "lr": 0.0002, "elapsed_sec": 49084.41669511795, "step_time_sec": 8.228637670981698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5921, "loss": 3.9988038539886475, "lr": 0.0002, "elapsed_sec": 49092.64653420448, "step_time_sec": 8.229667762992904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5922, "loss": 4.013201713562012, "lr": 0.0002, "elapsed_sec": 49100.877331495285, "step_time_sec": 8.230722034990322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5923, "loss": 3.902308940887451, "lr": 0.0002, "elapsed_sec": 49109.1068816185, "step_time_sec": 8.229318743018666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5924, "loss": 3.963385820388794, "lr": 0.0002, "elapsed_sec": 49117.334245204926, "step_time_sec": 8.227212978003081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5925, "loss": 4.069861888885498, "lr": 0.0002, "elapsed_sec": 49125.56318283081, "step_time_sec": 8.228817595023429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5926, "loss": 3.9411425590515137, "lr": 0.0002, "elapsed_sec": 49133.7946767807, "step_time_sec": 8.231302502012113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5927, "loss": 4.049111366271973, "lr": 0.0002, "elapsed_sec": 49142.02623653412, "step_time_sec": 8.231433938984992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5928, "loss": 4.007149696350098, "lr": 0.0002, "elapsed_sec": 49150.25681734085, "step_time_sec": 8.23046627198346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5929, "loss": 4.038665294647217, "lr": 0.0002, "elapsed_sec": 49158.48753905296, "step_time_sec": 8.230527395993704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5930, "loss": 4.124094486236572, "lr": 0.0002, "elapsed_sec": 49166.718636512756, "step_time_sec": 8.230943536997074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5931, "loss": 3.9396300315856934, "lr": 0.0002, "elapsed_sec": 49174.949496269226, "step_time_sec": 8.230766355991364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5932, "loss": 4.00822639465332, "lr": 0.0002, "elapsed_sec": 49183.180608034134, "step_time_sec": 8.230903792980826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5933, "loss": 4.070846080780029, "lr": 0.0002, "elapsed_sec": 49191.41207242012, "step_time_sec": 8.231348399014678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5934, "loss": 3.9879491329193115, "lr": 0.0002, "elapsed_sec": 49199.64218735695, "step_time_sec": 8.229918497003382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5935, "loss": 4.052121639251709, "lr": 0.0002, "elapsed_sec": 49207.87297129631, "step_time_sec": 8.230665276991203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5936, "loss": 3.941101551055908, "lr": 0.0002, "elapsed_sec": 49216.103618621826, "step_time_sec": 8.23049458899186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5937, "loss": 4.04248571395874, "lr": 0.0002, "elapsed_sec": 49224.33489179611, "step_time_sec": 8.231082279002294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5938, "loss": 4.043374061584473, "lr": 0.0002, "elapsed_sec": 49232.565821409225, "step_time_sec": 8.230773916991893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5939, "loss": 4.019152641296387, "lr": 0.0002, "elapsed_sec": 49240.79737830162, "step_time_sec": 8.23140198198962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5940, "loss": 4.049572467803955, "lr": 0.0002, "elapsed_sec": 49249.02737045288, "step_time_sec": 8.229851267999038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5941, "loss": 4.130309104919434, "lr": 0.0002, "elapsed_sec": 49257.257737874985, "step_time_sec": 8.23020501298015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5942, "loss": 4.0599260330200195, "lr": 0.0002, "elapsed_sec": 49265.48519539833, "step_time_sec": 8.227307461987948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5943, "loss": 4.168359756469727, "lr": 0.0002, "elapsed_sec": 49273.71415448189, "step_time_sec": 8.228868464007974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5944, "loss": 4.24461555480957, "lr": 0.0002, "elapsed_sec": 49281.94164228439, "step_time_sec": 8.2272573109949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5945, "loss": 4.072183132171631, "lr": 0.0002, "elapsed_sec": 49290.16966295242, "step_time_sec": 8.227920675009955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5946, "loss": 4.023138046264648, "lr": 0.0002, "elapsed_sec": 49298.39842772484, "step_time_sec": 8.228599541005678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5947, "loss": 4.107730865478516, "lr": 0.0002, "elapsed_sec": 49306.62791013718, "step_time_sec": 8.229333915980533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5948, "loss": 4.043879508972168, "lr": 0.0002, "elapsed_sec": 49314.85895490646, "step_time_sec": 8.230795495997882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5949, "loss": 3.943483591079712, "lr": 0.0002, "elapsed_sec": 49323.0898065567, "step_time_sec": 8.230717172991717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5950, "loss": 4.058795928955078, "lr": 0.0002, "elapsed_sec": 49331.32089138031, "step_time_sec": 8.230967337993206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5951, "loss": 4.1078901290893555, "lr": 0.0002, "elapsed_sec": 49339.55188703537, "step_time_sec": 8.230763638013741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5952, "loss": 4.0299882888793945, "lr": 0.0002, "elapsed_sec": 49347.78172326088, "step_time_sec": 8.229669930995442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5953, "loss": 3.922865152359009, "lr": 0.0002, "elapsed_sec": 49356.010001420975, "step_time_sec": 8.22811292699771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5954, "loss": 3.988806962966919, "lr": 0.0002, "elapsed_sec": 49364.241104602814, "step_time_sec": 8.23095332601224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5955, "loss": 4.0171990394592285, "lr": 0.0002, "elapsed_sec": 49372.4722905159, "step_time_sec": 8.23100464901654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5956, "loss": 4.149168491363525, "lr": 0.0002, "elapsed_sec": 49380.70311284065, "step_time_sec": 8.230686701979721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5957, "loss": 4.002275466918945, "lr": 0.0002, "elapsed_sec": 49388.934170246124, "step_time_sec": 8.230886937002651, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5958, "loss": 3.955754518508911, "lr": 0.0002, "elapsed_sec": 49397.16448569298, "step_time_sec": 8.230157830985263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5959, "loss": 4.299796104431152, "lr": 0.0002, "elapsed_sec": 49405.396139383316, "step_time_sec": 8.231556870014174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5960, "loss": 4.020846366882324, "lr": 0.0002, "elapsed_sec": 49413.625458955765, "step_time_sec": 8.229114771995228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5961, "loss": 4.122160911560059, "lr": 0.0002, "elapsed_sec": 49421.85533976555, "step_time_sec": 8.229749104997609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5962, "loss": 4.053924560546875, "lr": 0.0002, "elapsed_sec": 49430.08390665054, "step_time_sec": 8.22844437501044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5963, "loss": 4.079655647277832, "lr": 0.0002, "elapsed_sec": 49438.31048941612, "step_time_sec": 8.226454440009547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5964, "loss": 4.008899688720703, "lr": 0.0002, "elapsed_sec": 49446.53977680206, "step_time_sec": 8.229097266012104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5965, "loss": 4.050369739532471, "lr": 0.0002, "elapsed_sec": 49454.77025818825, "step_time_sec": 8.230295816989383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5966, "loss": 4.026066303253174, "lr": 0.0002, "elapsed_sec": 49463.00155854225, "step_time_sec": 8.231133741006488, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5967, "loss": 3.959984302520752, "lr": 0.0002, "elapsed_sec": 49471.229565143585, "step_time_sec": 8.227863747015363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5968, "loss": 3.9624462127685547, "lr": 0.0002, "elapsed_sec": 49479.45930790901, "step_time_sec": 8.229593848023796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5969, "loss": 4.100245475769043, "lr": 0.0002, "elapsed_sec": 49487.688796281815, "step_time_sec": 8.229312062001554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5970, "loss": 4.09610652923584, "lr": 0.0002, "elapsed_sec": 49495.91754293442, "step_time_sec": 8.228594304993749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5971, "loss": 4.04217004776001, "lr": 0.0002, "elapsed_sec": 49504.1459133625, "step_time_sec": 8.228224582999246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5972, "loss": 3.93807053565979, "lr": 0.0002, "elapsed_sec": 49512.37729382515, "step_time_sec": 8.231231968995417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5973, "loss": 4.042519569396973, "lr": 0.0002, "elapsed_sec": 49520.607776641846, "step_time_sec": 8.230351297999732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5974, "loss": 4.078677654266357, "lr": 0.0002, "elapsed_sec": 49528.839104652405, "step_time_sec": 8.231197808985598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5975, "loss": 4.114041805267334, "lr": 0.0002, "elapsed_sec": 49537.06848526001, "step_time_sec": 8.229168359015603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5976, "loss": 4.131710529327393, "lr": 0.0002, "elapsed_sec": 49545.29688954353, "step_time_sec": 8.228307084995322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5977, "loss": 4.014865875244141, "lr": 0.0002, "elapsed_sec": 49553.526357889175, "step_time_sec": 8.229265906004002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5978, "loss": 3.9588091373443604, "lr": 0.0002, "elapsed_sec": 49561.75539445877, "step_time_sec": 8.228967739007203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5979, "loss": 4.05804443359375, "lr": 0.0002, "elapsed_sec": 49569.98250293732, "step_time_sec": 8.22689052100759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5980, "loss": 3.9756016731262207, "lr": 0.0002, "elapsed_sec": 49578.21256017685, "step_time_sec": 8.229880654020235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5981, "loss": 4.000910758972168, "lr": 0.0002, "elapsed_sec": 49586.44387793541, "step_time_sec": 8.231165681994753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5982, "loss": 4.096929550170898, "lr": 0.0002, "elapsed_sec": 49594.67154335976, "step_time_sec": 8.22755966099794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5983, "loss": 4.070247173309326, "lr": 0.0002, "elapsed_sec": 49602.90211844444, "step_time_sec": 8.23037915097666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5984, "loss": 3.969878673553467, "lr": 0.0002, "elapsed_sec": 49611.13159227371, "step_time_sec": 8.229299133003224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5985, "loss": 3.9238343238830566, "lr": 0.0002, "elapsed_sec": 49619.36004734039, "step_time_sec": 8.228366416995414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5986, "loss": 3.9838428497314453, "lr": 0.0002, "elapsed_sec": 49627.58951449394, "step_time_sec": 8.229227565985639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5987, "loss": 4.101564407348633, "lr": 0.0002, "elapsed_sec": 49635.81908893585, "step_time_sec": 8.229434895998565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5988, "loss": 3.998901605606079, "lr": 0.0002, "elapsed_sec": 49644.04835867882, "step_time_sec": 8.229092334979214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5989, "loss": 4.145465850830078, "lr": 0.0002, "elapsed_sec": 49652.277265787125, "step_time_sec": 8.228780879988335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5990, "loss": 3.9865565299987793, "lr": 0.0002, "elapsed_sec": 49660.50719666481, "step_time_sec": 8.229812656994909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5991, "loss": 4.137277603149414, "lr": 0.0002, "elapsed_sec": 49668.73723125458, "step_time_sec": 8.229842226981418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5992, "loss": 4.079356670379639, "lr": 0.0002, "elapsed_sec": 49676.966109752655, "step_time_sec": 8.228757897013566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5993, "loss": 4.056480884552002, "lr": 0.0002, "elapsed_sec": 49685.194789886475, "step_time_sec": 8.22854349098634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5994, "loss": 4.09596586227417, "lr": 0.0002, "elapsed_sec": 49693.42339968681, "step_time_sec": 8.228393317986047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5995, "loss": 4.074723243713379, "lr": 0.0002, "elapsed_sec": 49701.65425205231, "step_time_sec": 8.230759831989417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5996, "loss": 3.8877696990966797, "lr": 0.0002, "elapsed_sec": 49709.88542056084, "step_time_sec": 8.230965483002365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5997, "loss": 3.9910826683044434, "lr": 0.0002, "elapsed_sec": 49718.1148352623, "step_time_sec": 8.229276956000831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5998, "loss": 3.9112744331359863, "lr": 0.0002, "elapsed_sec": 49726.34443736076, "step_time_sec": 8.229410133993952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 5999, "loss": 3.97407865524292, "lr": 0.0002, "elapsed_sec": 49734.57416653633, "step_time_sec": 8.229571822972503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6000, "loss": 4.139560699462891, "lr": 0.0002, "elapsed_sec": 49742.80324912071, "step_time_sec": 53.267057883000234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9974728900124319, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6001, "loss": 4.074009895324707, "lr": 0.0002, "elapsed_sec": 49796.08189344406, "step_time_sec": 8.240398383000866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6002, "loss": 3.992490768432617, "lr": 0.0002, "elapsed_sec": 49804.31062102318, "step_time_sec": 8.228586909011938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6003, "loss": 3.8710672855377197, "lr": 0.0002, "elapsed_sec": 49812.541132450104, "step_time_sec": 8.230363862006925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6004, "loss": 4.000166416168213, "lr": 0.0002, "elapsed_sec": 49820.77085042, "step_time_sec": 8.229593489988474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6005, "loss": 4.096874713897705, "lr": 0.0002, "elapsed_sec": 49828.99836015701, "step_time_sec": 8.227329355984693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6006, "loss": 4.116151332855225, "lr": 0.0002, "elapsed_sec": 49837.22596049309, "step_time_sec": 8.227490502991714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6007, "loss": 4.0124993324279785, "lr": 0.0002, "elapsed_sec": 49845.45749759674, "step_time_sec": 8.231379617005587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6008, "loss": 4.019351005554199, "lr": 0.0002, "elapsed_sec": 49853.68798446655, "step_time_sec": 8.230274055997143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6009, "loss": 4.22174596786499, "lr": 0.0002, "elapsed_sec": 49861.91848683357, "step_time_sec": 8.23040662202402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6010, "loss": 4.1095170974731445, "lr": 0.0002, "elapsed_sec": 49870.148884058, "step_time_sec": 8.230236100003822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6011, "loss": 4.066402912139893, "lr": 0.0002, "elapsed_sec": 49878.37972354889, "step_time_sec": 8.23066464299336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6012, "loss": 4.033993244171143, "lr": 0.0002, "elapsed_sec": 49886.60870194435, "step_time_sec": 8.22881614099606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6013, "loss": 4.0547685623168945, "lr": 0.0002, "elapsed_sec": 49894.83704829216, "step_time_sec": 8.22821698000189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6014, "loss": 3.9605069160461426, "lr": 0.0002, "elapsed_sec": 49903.06562900543, "step_time_sec": 8.228412119002314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6015, "loss": 4.048019886016846, "lr": 0.0002, "elapsed_sec": 49911.295840501785, "step_time_sec": 8.23005255201133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6016, "loss": 4.130589485168457, "lr": 0.0002, "elapsed_sec": 49919.5250453949, "step_time_sec": 8.229041917977156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6017, "loss": 4.108098983764648, "lr": 0.0002, "elapsed_sec": 49927.755073308945, "step_time_sec": 8.229866440000478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6018, "loss": 4.03592586517334, "lr": 0.0002, "elapsed_sec": 49935.984055519104, "step_time_sec": 8.228841566015035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6019, "loss": 4.108002185821533, "lr": 0.0002, "elapsed_sec": 49944.21277284622, "step_time_sec": 8.228556321002543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6020, "loss": 4.0748467445373535, "lr": 0.0002, "elapsed_sec": 49952.4411547184, "step_time_sec": 8.228231888992013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6021, "loss": 4.041729927062988, "lr": 0.0002, "elapsed_sec": 49960.669795274734, "step_time_sec": 8.228501821984537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6022, "loss": 4.110454082489014, "lr": 0.0002, "elapsed_sec": 49968.90046811104, "step_time_sec": 8.230465225991793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6023, "loss": 4.089703559875488, "lr": 0.0002, "elapsed_sec": 49977.12865567207, "step_time_sec": 8.228090934018837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6024, "loss": 4.184309959411621, "lr": 0.0002, "elapsed_sec": 49985.357456445694, "step_time_sec": 8.228587309014983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6025, "loss": 4.087230682373047, "lr": 0.0002, "elapsed_sec": 49993.58638715744, "step_time_sec": 8.228839738003444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6026, "loss": 4.135879039764404, "lr": 0.0002, "elapsed_sec": 50001.81405091286, "step_time_sec": 8.22749100899091, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6027, "loss": 4.042682647705078, "lr": 0.0002, "elapsed_sec": 50010.04475188255, "step_time_sec": 8.2304821430007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6028, "loss": 3.9122049808502197, "lr": 0.0002, "elapsed_sec": 50018.27582550049, "step_time_sec": 8.23089050699491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6029, "loss": 3.963369607925415, "lr": 0.0002, "elapsed_sec": 50026.50403833389, "step_time_sec": 8.228051606012741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6030, "loss": 3.961566686630249, "lr": 0.0002, "elapsed_sec": 50034.73424625397, "step_time_sec": 8.23003962598159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6031, "loss": 3.9532032012939453, "lr": 0.0002, "elapsed_sec": 50042.96176600456, "step_time_sec": 8.227369934989838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6032, "loss": 4.324065208435059, "lr": 0.0002, "elapsed_sec": 50051.19013237953, "step_time_sec": 8.228219488984905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6033, "loss": 4.065674781799316, "lr": 0.0002, "elapsed_sec": 50059.419600725174, "step_time_sec": 8.229324148996966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6034, "loss": 3.988403081893921, "lr": 0.0002, "elapsed_sec": 50067.650332927704, "step_time_sec": 8.230543918994954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6035, "loss": 3.9018566608428955, "lr": 0.0002, "elapsed_sec": 50075.880652189255, "step_time_sec": 8.230206266016467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6036, "loss": 4.162990093231201, "lr": 0.0002, "elapsed_sec": 50084.110429525375, "step_time_sec": 8.229614134004805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6037, "loss": 4.063287734985352, "lr": 0.0002, "elapsed_sec": 50092.34158182144, "step_time_sec": 8.231029803981073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6038, "loss": 4.05194616317749, "lr": 0.0002, "elapsed_sec": 50100.57138633728, "step_time_sec": 8.229642215999775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6039, "loss": 4.028723239898682, "lr": 0.0002, "elapsed_sec": 50108.79967594147, "step_time_sec": 8.228065002011135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6040, "loss": 4.098809719085693, "lr": 0.0002, "elapsed_sec": 50117.02805042267, "step_time_sec": 8.228223921993049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6041, "loss": 4.061488628387451, "lr": 0.0002, "elapsed_sec": 50125.25502753258, "step_time_sec": 8.22678409400396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6042, "loss": 3.99638032913208, "lr": 0.0002, "elapsed_sec": 50133.48312711716, "step_time_sec": 8.227927576983348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6043, "loss": 3.9724223613739014, "lr": 0.0002, "elapsed_sec": 50141.7127532959, "step_time_sec": 8.229480982990935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6044, "loss": 4.155501365661621, "lr": 0.0002, "elapsed_sec": 50149.94389438629, "step_time_sec": 8.231017053010873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6045, "loss": 4.026564121246338, "lr": 0.0002, "elapsed_sec": 50158.17433357239, "step_time_sec": 8.230302128009498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6046, "loss": 4.060348033905029, "lr": 0.0002, "elapsed_sec": 50166.404744148254, "step_time_sec": 8.230249086016556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6047, "loss": 4.024266242980957, "lr": 0.0002, "elapsed_sec": 50174.63544869423, "step_time_sec": 8.230493265989935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6048, "loss": 4.078804016113281, "lr": 0.0002, "elapsed_sec": 50182.865464925766, "step_time_sec": 8.229801845009206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6049, "loss": 4.021694660186768, "lr": 0.0002, "elapsed_sec": 50191.09501409531, "step_time_sec": 8.229467718978412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6050, "loss": 3.9952147006988525, "lr": 0.0002, "elapsed_sec": 50199.323102235794, "step_time_sec": 8.227888352994341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6051, "loss": 4.099106311798096, "lr": 0.0002, "elapsed_sec": 50207.550996780396, "step_time_sec": 8.227755387022626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6052, "loss": 4.147480487823486, "lr": 0.0002, "elapsed_sec": 50215.77910518646, "step_time_sec": 8.227963184006512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6053, "loss": 4.106292724609375, "lr": 0.0002, "elapsed_sec": 50224.0092291832, "step_time_sec": 8.229916570999194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6054, "loss": 3.9575634002685547, "lr": 0.0002, "elapsed_sec": 50232.23955988884, "step_time_sec": 8.230151602998376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6055, "loss": 4.096940040588379, "lr": 0.0002, "elapsed_sec": 50240.47024178505, "step_time_sec": 8.230539485026384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6056, "loss": 4.021152496337891, "lr": 0.0002, "elapsed_sec": 50248.700753211975, "step_time_sec": 8.23033527500229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6057, "loss": 4.0688157081604, "lr": 0.0002, "elapsed_sec": 50256.931849718094, "step_time_sec": 8.230970649019582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6058, "loss": 4.008421421051025, "lr": 0.0002, "elapsed_sec": 50265.16296195984, "step_time_sec": 8.230925300973468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6059, "loss": 3.9902408123016357, "lr": 0.0002, "elapsed_sec": 50273.392308950424, "step_time_sec": 8.229227005009307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6060, "loss": 4.026578426361084, "lr": 0.0002, "elapsed_sec": 50281.62250447273, "step_time_sec": 8.230022100993665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6061, "loss": 3.770705461502075, "lr": 0.0002, "elapsed_sec": 50289.85346388817, "step_time_sec": 8.230829347012332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6062, "loss": 3.9494922161102295, "lr": 0.0002, "elapsed_sec": 50298.08338022232, "step_time_sec": 8.229725623008562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6063, "loss": 4.014941692352295, "lr": 0.0002, "elapsed_sec": 50306.31407380104, "step_time_sec": 8.230540107993875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6064, "loss": 4.052332401275635, "lr": 0.0002, "elapsed_sec": 50314.54424262047, "step_time_sec": 8.230008109996561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6065, "loss": 4.031266689300537, "lr": 0.0002, "elapsed_sec": 50322.77312660217, "step_time_sec": 8.228779786004452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6066, "loss": 4.150460243225098, "lr": 0.0002, "elapsed_sec": 50331.00152730942, "step_time_sec": 8.228245686012087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6067, "loss": 3.9809110164642334, "lr": 0.0002, "elapsed_sec": 50339.23143219948, "step_time_sec": 8.22968771198066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6068, "loss": 4.051927089691162, "lr": 0.0002, "elapsed_sec": 50347.462230205536, "step_time_sec": 8.230685119022382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6069, "loss": 3.996093273162842, "lr": 0.0002, "elapsed_sec": 50355.69345974922, "step_time_sec": 8.231045707012527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6070, "loss": 4.095937728881836, "lr": 0.0002, "elapsed_sec": 50363.9247610569, "step_time_sec": 8.231118836003589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6071, "loss": 4.046003818511963, "lr": 0.0002, "elapsed_sec": 50372.155732154846, "step_time_sec": 8.230824619997293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6072, "loss": 3.9407646656036377, "lr": 0.0002, "elapsed_sec": 50380.3854675293, "step_time_sec": 8.229574491007952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6073, "loss": 4.03778600692749, "lr": 0.0002, "elapsed_sec": 50388.61759734154, "step_time_sec": 8.232017208996695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6074, "loss": 3.9964287281036377, "lr": 0.0002, "elapsed_sec": 50396.84814977646, "step_time_sec": 8.230370009987382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6075, "loss": 3.9916646480560303, "lr": 0.0002, "elapsed_sec": 50405.07676196098, "step_time_sec": 8.228452070005005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6076, "loss": 4.1138811111450195, "lr": 0.0002, "elapsed_sec": 50413.30644440651, "step_time_sec": 8.229534301994136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6077, "loss": 3.962373971939087, "lr": 0.0002, "elapsed_sec": 50421.53714108467, "step_time_sec": 8.230588639999041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6078, "loss": 3.9538557529449463, "lr": 0.0002, "elapsed_sec": 50429.76809954643, "step_time_sec": 8.230757208977593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6079, "loss": 4.078714370727539, "lr": 0.0002, "elapsed_sec": 50437.99913573265, "step_time_sec": 8.23092734097736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6080, "loss": 4.220361709594727, "lr": 0.0002, "elapsed_sec": 50446.22852110863, "step_time_sec": 8.229238339990843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6081, "loss": 4.057001113891602, "lr": 0.0002, "elapsed_sec": 50454.45934009552, "step_time_sec": 8.23060946498299, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6082, "loss": 4.044034481048584, "lr": 0.0002, "elapsed_sec": 50462.68939161301, "step_time_sec": 8.229928360000486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6083, "loss": 4.164155960083008, "lr": 0.0002, "elapsed_sec": 50470.91731739044, "step_time_sec": 8.227769064018503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6084, "loss": 3.93865966796875, "lr": 0.0002, "elapsed_sec": 50479.14672613144, "step_time_sec": 8.229211805999512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6085, "loss": 3.870061159133911, "lr": 0.0002, "elapsed_sec": 50487.37786626816, "step_time_sec": 8.23097833400243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6086, "loss": 3.9482271671295166, "lr": 0.0002, "elapsed_sec": 50495.60824346542, "step_time_sec": 8.230248412000947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6087, "loss": 3.9789109230041504, "lr": 0.0002, "elapsed_sec": 50503.838584661484, "step_time_sec": 8.230178327998146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6088, "loss": 3.821042537689209, "lr": 0.0002, "elapsed_sec": 50512.06967711449, "step_time_sec": 8.230930502002593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6089, "loss": 4.039133071899414, "lr": 0.0002, "elapsed_sec": 50520.29967427254, "step_time_sec": 8.229837010992924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6090, "loss": 4.0181732177734375, "lr": 0.0002, "elapsed_sec": 50528.52892565727, "step_time_sec": 8.229149000981124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6091, "loss": 3.913933753967285, "lr": 0.0002, "elapsed_sec": 50536.76024270058, "step_time_sec": 8.23118926101597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6092, "loss": 3.8692455291748047, "lr": 0.0002, "elapsed_sec": 50544.99142718315, "step_time_sec": 8.23095670901239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6093, "loss": 3.841602325439453, "lr": 0.0002, "elapsed_sec": 50553.2223610878, "step_time_sec": 8.230769843998132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6094, "loss": 3.9270880222320557, "lr": 0.0002, "elapsed_sec": 50561.453221559525, "step_time_sec": 8.230713316996116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6095, "loss": 4.100649833679199, "lr": 0.0002, "elapsed_sec": 50569.68470430374, "step_time_sec": 8.23134169500554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6096, "loss": 3.8820366859436035, "lr": 0.0002, "elapsed_sec": 50577.9138174057, "step_time_sec": 8.228994550998323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6097, "loss": 3.9817636013031006, "lr": 0.0002, "elapsed_sec": 50586.14360308647, "step_time_sec": 8.229629806010053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6098, "loss": 4.0275654792785645, "lr": 0.0002, "elapsed_sec": 50594.37404179573, "step_time_sec": 8.230241113982629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6099, "loss": 4.014427661895752, "lr": 0.0002, "elapsed_sec": 50602.60425853729, "step_time_sec": 8.230054072017083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6100, "loss": 4.0429840087890625, "lr": 0.0002, "elapsed_sec": 50610.83488869667, "step_time_sec": 8.230501451005694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6101, "loss": 4.075871467590332, "lr": 0.0002, "elapsed_sec": 50619.06580209732, "step_time_sec": 8.230759914004011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6102, "loss": 3.8948280811309814, "lr": 0.0002, "elapsed_sec": 50627.296020030975, "step_time_sec": 8.230043755000224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6103, "loss": 4.041744709014893, "lr": 0.0002, "elapsed_sec": 50635.5263633728, "step_time_sec": 8.230175628006691, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6104, "loss": 4.048869609832764, "lr": 0.0002, "elapsed_sec": 50643.75696659088, "step_time_sec": 8.23042685200926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6105, "loss": 3.8240363597869873, "lr": 0.0002, "elapsed_sec": 50651.98723578453, "step_time_sec": 8.230174137017457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6106, "loss": 3.9123659133911133, "lr": 0.0002, "elapsed_sec": 50660.21747326851, "step_time_sec": 8.230030948005151, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6107, "loss": 3.931037425994873, "lr": 0.0002, "elapsed_sec": 50668.44811820984, "step_time_sec": 8.230488442990463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6108, "loss": 4.110426902770996, "lr": 0.0002, "elapsed_sec": 50676.67735886574, "step_time_sec": 8.229076881019864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6109, "loss": 4.037057399749756, "lr": 0.0002, "elapsed_sec": 50684.90830492973, "step_time_sec": 8.230837375012925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6110, "loss": 3.991126775741577, "lr": 0.0002, "elapsed_sec": 50693.139811992645, "step_time_sec": 8.231366521999007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6111, "loss": 4.0625176429748535, "lr": 0.0002, "elapsed_sec": 50701.36847567558, "step_time_sec": 8.228494120005053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6112, "loss": 3.987182855606079, "lr": 0.0002, "elapsed_sec": 50709.597534418106, "step_time_sec": 8.22886322697741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6113, "loss": 4.014142990112305, "lr": 0.0002, "elapsed_sec": 50717.82421016693, "step_time_sec": 8.226521019998472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6114, "loss": 4.110359191894531, "lr": 0.0002, "elapsed_sec": 50726.05393409729, "step_time_sec": 8.229542593006045, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6115, "loss": 4.107777118682861, "lr": 0.0002, "elapsed_sec": 50734.28290224075, "step_time_sec": 8.228860605013324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6116, "loss": 4.074305057525635, "lr": 0.0002, "elapsed_sec": 50742.51330661774, "step_time_sec": 8.23023618600564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6117, "loss": 4.037402153015137, "lr": 0.0002, "elapsed_sec": 50750.743884801865, "step_time_sec": 8.23040778900031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6118, "loss": 3.9297146797180176, "lr": 0.0002, "elapsed_sec": 50758.97282886505, "step_time_sec": 8.228793666989077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6119, "loss": 4.004018783569336, "lr": 0.0002, "elapsed_sec": 50767.203424453735, "step_time_sec": 8.230456945981132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6120, "loss": 4.021581172943115, "lr": 0.0002, "elapsed_sec": 50775.43408346176, "step_time_sec": 8.230570796993561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6121, "loss": 4.05665922164917, "lr": 0.0002, "elapsed_sec": 50783.664685726166, "step_time_sec": 8.230374167003902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6122, "loss": 4.068724632263184, "lr": 0.0002, "elapsed_sec": 50791.89475107193, "step_time_sec": 8.22992380798678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6123, "loss": 3.9834628105163574, "lr": 0.0002, "elapsed_sec": 50800.12627887726, "step_time_sec": 8.231395548005821, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6124, "loss": 4.155081272125244, "lr": 0.0002, "elapsed_sec": 50808.35671854019, "step_time_sec": 8.23031521201483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6125, "loss": 4.163969039916992, "lr": 0.0002, "elapsed_sec": 50816.58737063408, "step_time_sec": 8.23043670001789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6126, "loss": 4.077258110046387, "lr": 0.0002, "elapsed_sec": 50824.81656122208, "step_time_sec": 8.229053215996828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6127, "loss": 3.9456169605255127, "lr": 0.0002, "elapsed_sec": 50833.04584693909, "step_time_sec": 8.229145299992524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6128, "loss": 3.9666335582733154, "lr": 0.0002, "elapsed_sec": 50841.27540230751, "step_time_sec": 8.229394893016433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6129, "loss": 3.937516689300537, "lr": 0.0002, "elapsed_sec": 50849.50569868088, "step_time_sec": 8.230116149003152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6130, "loss": 4.008353233337402, "lr": 0.0002, "elapsed_sec": 50857.735635757446, "step_time_sec": 8.229849059018306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6131, "loss": 3.910672664642334, "lr": 0.0002, "elapsed_sec": 50865.96478843689, "step_time_sec": 8.228936466010055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6132, "loss": 3.9695684909820557, "lr": 0.0002, "elapsed_sec": 50874.19424057007, "step_time_sec": 8.229290594987106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6133, "loss": 3.811455249786377, "lr": 0.0002, "elapsed_sec": 50882.423177957535, "step_time_sec": 8.228820018004626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6134, "loss": 3.923374652862549, "lr": 0.0002, "elapsed_sec": 50890.65360045433, "step_time_sec": 8.23023646499496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6135, "loss": 3.9827754497528076, "lr": 0.0002, "elapsed_sec": 50898.88440322876, "step_time_sec": 8.230629174999194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6136, "loss": 3.9599149227142334, "lr": 0.0002, "elapsed_sec": 50907.11498379707, "step_time_sec": 8.230446447996655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6137, "loss": 3.9436488151550293, "lr": 0.0002, "elapsed_sec": 50915.34515929222, "step_time_sec": 8.230010952014709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6138, "loss": 3.96940016746521, "lr": 0.0002, "elapsed_sec": 50923.57600021362, "step_time_sec": 8.230724516004557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6139, "loss": 4.1393656730651855, "lr": 0.0002, "elapsed_sec": 50931.805748701096, "step_time_sec": 8.229538377985591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6140, "loss": 3.8990108966827393, "lr": 0.0002, "elapsed_sec": 50940.03470349312, "step_time_sec": 8.228826074977405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6141, "loss": 3.9813575744628906, "lr": 0.0002, "elapsed_sec": 50948.26140618324, "step_time_sec": 8.226577110996004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6142, "loss": 4.026434421539307, "lr": 0.0002, "elapsed_sec": 50956.48960351944, "step_time_sec": 8.228019450994907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6143, "loss": 4.024306297302246, "lr": 0.0002, "elapsed_sec": 50964.71726131439, "step_time_sec": 8.227467093995074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6144, "loss": 4.059316635131836, "lr": 0.0002, "elapsed_sec": 50972.94772052765, "step_time_sec": 8.230285875994014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6145, "loss": 4.053820610046387, "lr": 0.0002, "elapsed_sec": 50981.17863321304, "step_time_sec": 8.230753729993012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6146, "loss": 3.821662187576294, "lr": 0.0002, "elapsed_sec": 50989.41067361832, "step_time_sec": 8.231906169006834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6147, "loss": 4.102994918823242, "lr": 0.0002, "elapsed_sec": 50997.6402285099, "step_time_sec": 8.22939286500332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6148, "loss": 3.879049062728882, "lr": 0.0002, "elapsed_sec": 51005.87139964104, "step_time_sec": 8.230995697987964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6149, "loss": 4.001705646514893, "lr": 0.0002, "elapsed_sec": 51014.10219168663, "step_time_sec": 8.230697755992878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6150, "loss": 4.075413227081299, "lr": 0.0002, "elapsed_sec": 51022.333313941956, "step_time_sec": 8.230931074009277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6151, "loss": 3.9417948722839355, "lr": 0.0002, "elapsed_sec": 51030.564355134964, "step_time_sec": 8.230888074991526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6152, "loss": 3.998650074005127, "lr": 0.0002, "elapsed_sec": 51038.79458642006, "step_time_sec": 8.230142776999855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6153, "loss": 4.0614776611328125, "lr": 0.0002, "elapsed_sec": 51047.025554180145, "step_time_sec": 8.230758299992885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6154, "loss": 3.914163589477539, "lr": 0.0002, "elapsed_sec": 51055.2531311512, "step_time_sec": 8.227427985984832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6155, "loss": 4.071159362792969, "lr": 0.0002, "elapsed_sec": 51063.48272681236, "step_time_sec": 8.229470035003033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6156, "loss": 3.944492816925049, "lr": 0.0002, "elapsed_sec": 51071.71113586426, "step_time_sec": 8.22825344800367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6157, "loss": 4.080633640289307, "lr": 0.0002, "elapsed_sec": 51079.94190406799, "step_time_sec": 8.230550912994659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6158, "loss": 3.9591803550720215, "lr": 0.0002, "elapsed_sec": 51088.17313337326, "step_time_sec": 8.231082013022387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6159, "loss": 4.012997150421143, "lr": 0.0002, "elapsed_sec": 51096.403437137604, "step_time_sec": 8.230183818988735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6160, "loss": 4.030004501342773, "lr": 0.0002, "elapsed_sec": 51104.63455605507, "step_time_sec": 8.23093273100676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6161, "loss": 3.8570809364318848, "lr": 0.0002, "elapsed_sec": 51112.86555480957, "step_time_sec": 8.230874872999266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6162, "loss": 3.9359638690948486, "lr": 0.0002, "elapsed_sec": 51121.09566307068, "step_time_sec": 8.229905339016113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6163, "loss": 4.107895374298096, "lr": 0.0002, "elapsed_sec": 51129.32527065277, "step_time_sec": 8.229468336008722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6164, "loss": 3.959894895553589, "lr": 0.0002, "elapsed_sec": 51137.5532143116, "step_time_sec": 8.22778779399232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6165, "loss": 3.8469629287719727, "lr": 0.0002, "elapsed_sec": 51145.781108379364, "step_time_sec": 8.227742215996841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6166, "loss": 4.048964023590088, "lr": 0.0002, "elapsed_sec": 51154.010820150375, "step_time_sec": 8.2296109769959, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6167, "loss": 4.128787040710449, "lr": 0.0002, "elapsed_sec": 51162.23906779289, "step_time_sec": 8.22803889599163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6168, "loss": 4.008283615112305, "lr": 0.0002, "elapsed_sec": 51170.467427015305, "step_time_sec": 8.228183247993002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6169, "loss": 3.9697139263153076, "lr": 0.0002, "elapsed_sec": 51178.69583487511, "step_time_sec": 8.228291164996335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6170, "loss": 4.002580642700195, "lr": 0.0002, "elapsed_sec": 51186.92678809166, "step_time_sec": 8.230769487010548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6171, "loss": 3.950362205505371, "lr": 0.0002, "elapsed_sec": 51195.15800738335, "step_time_sec": 8.231125724007143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6172, "loss": 4.012689590454102, "lr": 0.0002, "elapsed_sec": 51203.38770318031, "step_time_sec": 8.22951280901907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6173, "loss": 4.04656982421875, "lr": 0.0002, "elapsed_sec": 51211.61520791054, "step_time_sec": 8.227354057977209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6174, "loss": 4.0016021728515625, "lr": 0.0002, "elapsed_sec": 51219.84547948837, "step_time_sec": 8.230101323017152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6175, "loss": 4.0680999755859375, "lr": 0.0002, "elapsed_sec": 51228.076982975006, "step_time_sec": 8.23136837000493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6176, "loss": 4.054131507873535, "lr": 0.0002, "elapsed_sec": 51236.306668519974, "step_time_sec": 8.2294612529804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6177, "loss": 3.9750113487243652, "lr": 0.0002, "elapsed_sec": 51244.535180807114, "step_time_sec": 8.22839432299952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6178, "loss": 3.977736473083496, "lr": 0.0002, "elapsed_sec": 51252.76387000084, "step_time_sec": 8.228509186999872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6179, "loss": 4.032241344451904, "lr": 0.0002, "elapsed_sec": 51260.99082684517, "step_time_sec": 8.226812108012382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6180, "loss": 4.016869068145752, "lr": 0.0002, "elapsed_sec": 51269.21966934204, "step_time_sec": 8.228672037977958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6181, "loss": 4.057661533355713, "lr": 0.0002, "elapsed_sec": 51277.44955730438, "step_time_sec": 8.22973493198515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6182, "loss": 4.0830230712890625, "lr": 0.0002, "elapsed_sec": 51285.67904853821, "step_time_sec": 8.22937116099638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6183, "loss": 4.143900394439697, "lr": 0.0002, "elapsed_sec": 51293.90989804268, "step_time_sec": 8.230726159003098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6184, "loss": 3.8907933235168457, "lr": 0.0002, "elapsed_sec": 51302.14005732536, "step_time_sec": 8.229946361010661, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6185, "loss": 3.9382824897766113, "lr": 0.0002, "elapsed_sec": 51310.369869709015, "step_time_sec": 8.229655461007496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6186, "loss": 3.9695425033569336, "lr": 0.0002, "elapsed_sec": 51318.600699186325, "step_time_sec": 8.230731171002844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6187, "loss": 4.034655570983887, "lr": 0.0002, "elapsed_sec": 51326.83080124855, "step_time_sec": 8.229954962996999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6188, "loss": 3.9637374877929688, "lr": 0.0002, "elapsed_sec": 51335.06077861786, "step_time_sec": 8.229776760999812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6189, "loss": 4.114490032196045, "lr": 0.0002, "elapsed_sec": 51343.291608572006, "step_time_sec": 8.230668308999157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6190, "loss": 4.186320781707764, "lr": 0.0002, "elapsed_sec": 51351.52087879181, "step_time_sec": 8.22910620100447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6191, "loss": 4.01482629776001, "lr": 0.0002, "elapsed_sec": 51359.74804639816, "step_time_sec": 8.227042045007693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6192, "loss": 3.98885440826416, "lr": 0.0002, "elapsed_sec": 51367.97649407387, "step_time_sec": 8.228275267989375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6193, "loss": 3.9449820518493652, "lr": 0.0002, "elapsed_sec": 51376.20597910881, "step_time_sec": 8.229329341003904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6194, "loss": 4.172144412994385, "lr": 0.0002, "elapsed_sec": 51384.431272506714, "step_time_sec": 8.225160476984456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6195, "loss": 3.951586961746216, "lr": 0.0002, "elapsed_sec": 51392.661804676056, "step_time_sec": 8.230363168986514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6196, "loss": 3.922393798828125, "lr": 0.0002, "elapsed_sec": 51400.89239549637, "step_time_sec": 8.230442178988596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6197, "loss": 4.0364670753479, "lr": 0.0002, "elapsed_sec": 51409.1204996109, "step_time_sec": 8.22799222698086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6198, "loss": 3.9020321369171143, "lr": 0.0002, "elapsed_sec": 51417.34986591339, "step_time_sec": 8.229158800997538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6199, "loss": 4.06629753112793, "lr": 0.0002, "elapsed_sec": 51425.5768866539, "step_time_sec": 8.226868058001855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6200, "loss": 4.033210754394531, "lr": 0.0002, "elapsed_sec": 51433.806736946106, "step_time_sec": 8.22976937898784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6201, "loss": 3.9953086376190186, "lr": 0.0002, "elapsed_sec": 51442.03572511673, "step_time_sec": 8.228833546017995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6202, "loss": 3.999581813812256, "lr": 0.0002, "elapsed_sec": 51450.264749765396, "step_time_sec": 8.22881281300215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6203, "loss": 4.121184825897217, "lr": 0.0002, "elapsed_sec": 51458.49559688568, "step_time_sec": 8.230677485989872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6204, "loss": 4.002583026885986, "lr": 0.0002, "elapsed_sec": 51466.72567987442, "step_time_sec": 8.230005243996857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6205, "loss": 3.9203786849975586, "lr": 0.0002, "elapsed_sec": 51474.956762075424, "step_time_sec": 8.230872597981943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6206, "loss": 4.07992696762085, "lr": 0.0002, "elapsed_sec": 51483.18713474274, "step_time_sec": 8.230228443979286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6207, "loss": 3.8696699142456055, "lr": 0.0002, "elapsed_sec": 51491.41719865799, "step_time_sec": 8.229901118989801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6208, "loss": 3.9974491596221924, "lr": 0.0002, "elapsed_sec": 51499.648298978806, "step_time_sec": 8.23095622897381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6209, "loss": 3.8323659896850586, "lr": 0.0002, "elapsed_sec": 51507.87826538086, "step_time_sec": 8.229809371987358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6210, "loss": 3.8864972591400146, "lr": 0.0002, "elapsed_sec": 51516.10819339752, "step_time_sec": 8.229764663003152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6211, "loss": 4.085510730743408, "lr": 0.0002, "elapsed_sec": 51524.33813524246, "step_time_sec": 8.229827225994086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6212, "loss": 4.004391193389893, "lr": 0.0002, "elapsed_sec": 51532.568547964096, "step_time_sec": 8.23021046401118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6213, "loss": 4.083998680114746, "lr": 0.0002, "elapsed_sec": 51540.797459840775, "step_time_sec": 8.228770237008575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6214, "loss": 4.022835731506348, "lr": 0.0002, "elapsed_sec": 51549.02643823624, "step_time_sec": 8.228846628015162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6215, "loss": 4.006460189819336, "lr": 0.0002, "elapsed_sec": 51557.25765490532, "step_time_sec": 8.231029596994631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6216, "loss": 3.982830047607422, "lr": 0.0002, "elapsed_sec": 51565.48797297478, "step_time_sec": 8.2301413739915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6217, "loss": 3.938814640045166, "lr": 0.0002, "elapsed_sec": 51573.71748328209, "step_time_sec": 8.229357833013637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6218, "loss": 4.006607532501221, "lr": 0.0002, "elapsed_sec": 51581.945652484894, "step_time_sec": 8.22807059501065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6219, "loss": 3.987884283065796, "lr": 0.0002, "elapsed_sec": 51590.17279338837, "step_time_sec": 8.226968715025578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6220, "loss": 3.8072943687438965, "lr": 0.0002, "elapsed_sec": 51598.40381717682, "step_time_sec": 8.230912940984126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6221, "loss": 4.170008182525635, "lr": 0.0002, "elapsed_sec": 51606.6404774189, "step_time_sec": 8.229942620004294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6222, "loss": 3.8558433055877686, "lr": 0.0002, "elapsed_sec": 51614.871554374695, "step_time_sec": 8.230936684005428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6223, "loss": 3.9680116176605225, "lr": 0.0002, "elapsed_sec": 51623.10195231438, "step_time_sec": 8.2302574130008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6224, "loss": 3.8716025352478027, "lr": 0.0002, "elapsed_sec": 51631.33255147934, "step_time_sec": 8.230466098000761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6225, "loss": 3.9664392471313477, "lr": 0.0002, "elapsed_sec": 51639.56259226799, "step_time_sec": 8.229856548976386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6226, "loss": 3.996980667114258, "lr": 0.0002, "elapsed_sec": 51647.79338884354, "step_time_sec": 8.230637824017322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6227, "loss": 4.044581413269043, "lr": 0.0002, "elapsed_sec": 51656.02530384064, "step_time_sec": 8.231783655995969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6228, "loss": 4.143523693084717, "lr": 0.0002, "elapsed_sec": 51664.25614452362, "step_time_sec": 8.230724479013588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6229, "loss": 3.9953057765960693, "lr": 0.0002, "elapsed_sec": 51672.486917972565, "step_time_sec": 8.230573695007479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6230, "loss": 3.955435276031494, "lr": 0.0002, "elapsed_sec": 51680.717447042465, "step_time_sec": 8.230370449018665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6231, "loss": 3.9904820919036865, "lr": 0.0002, "elapsed_sec": 51688.94779467583, "step_time_sec": 8.230193044000771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6232, "loss": 4.002462863922119, "lr": 0.0002, "elapsed_sec": 51697.176446676254, "step_time_sec": 8.22852604900254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6233, "loss": 4.048469066619873, "lr": 0.0002, "elapsed_sec": 51705.40372800827, "step_time_sec": 8.227121019008337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6234, "loss": 4.065395832061768, "lr": 0.0002, "elapsed_sec": 51713.63242030144, "step_time_sec": 8.2284955070063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6235, "loss": 4.075672149658203, "lr": 0.0002, "elapsed_sec": 51721.86104440689, "step_time_sec": 8.228545514983125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6236, "loss": 3.8893301486968994, "lr": 0.0002, "elapsed_sec": 51730.0904045105, "step_time_sec": 8.229205355019076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6237, "loss": 3.916337013244629, "lr": 0.0002, "elapsed_sec": 51738.31858921051, "step_time_sec": 8.22795784700429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6238, "loss": 4.101945877075195, "lr": 0.0002, "elapsed_sec": 51746.54823231697, "step_time_sec": 8.229496966989245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6239, "loss": 3.838892936706543, "lr": 0.0002, "elapsed_sec": 51754.77962589264, "step_time_sec": 8.231284053006675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6240, "loss": 3.987586259841919, "lr": 0.0002, "elapsed_sec": 51763.01086759567, "step_time_sec": 8.231041329010623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6241, "loss": 3.8836612701416016, "lr": 0.0002, "elapsed_sec": 51771.241909742355, "step_time_sec": 8.23088049000944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6242, "loss": 4.034970283508301, "lr": 0.0002, "elapsed_sec": 51779.47131443024, "step_time_sec": 8.229305640998064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6243, "loss": 3.881934881210327, "lr": 0.0002, "elapsed_sec": 51787.69998383522, "step_time_sec": 8.228447650995804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6244, "loss": 4.009581089019775, "lr": 0.0002, "elapsed_sec": 51795.929612636566, "step_time_sec": 8.22948818502482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6245, "loss": 4.07778263092041, "lr": 0.0002, "elapsed_sec": 51804.15720629692, "step_time_sec": 8.227436585992109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6246, "loss": 4.027798175811768, "lr": 0.0002, "elapsed_sec": 51812.387570142746, "step_time_sec": 8.230218056996819, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6247, "loss": 4.081079959869385, "lr": 0.0002, "elapsed_sec": 51820.618619680405, "step_time_sec": 8.230968621995999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6248, "loss": 4.011582374572754, "lr": 0.0002, "elapsed_sec": 51828.8491859436, "step_time_sec": 8.230369061988313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6249, "loss": 3.8992879390716553, "lr": 0.0002, "elapsed_sec": 51837.07860469818, "step_time_sec": 8.22929100197507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6250, "loss": 4.014105796813965, "lr": 0.0002, "elapsed_sec": 51845.309663534164, "step_time_sec": 8.230858234019252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6251, "loss": 3.9902853965759277, "lr": 0.0002, "elapsed_sec": 51853.53981637955, "step_time_sec": 8.229995383007918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6252, "loss": 3.9037632942199707, "lr": 0.0002, "elapsed_sec": 51861.76928520203, "step_time_sec": 8.229340447025606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6253, "loss": 4.002696514129639, "lr": 0.0002, "elapsed_sec": 51870.00082445145, "step_time_sec": 8.231371593981748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6254, "loss": 4.103195667266846, "lr": 0.0002, "elapsed_sec": 51878.231122493744, "step_time_sec": 8.230106189992512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6255, "loss": 3.881671905517578, "lr": 0.0002, "elapsed_sec": 51886.46201848984, "step_time_sec": 8.230740642000455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6256, "loss": 4.085102558135986, "lr": 0.0002, "elapsed_sec": 51894.69153904915, "step_time_sec": 8.229396404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6257, "loss": 3.8951852321624756, "lr": 0.0002, "elapsed_sec": 51902.91930246353, "step_time_sec": 8.22757013599039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6258, "loss": 4.02030086517334, "lr": 0.0002, "elapsed_sec": 51911.14963555336, "step_time_sec": 8.230182826984674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6259, "loss": 3.9699339866638184, "lr": 0.0002, "elapsed_sec": 51919.378368616104, "step_time_sec": 8.228587671997957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6260, "loss": 4.067733287811279, "lr": 0.0002, "elapsed_sec": 51927.60807991028, "step_time_sec": 8.229547599999933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6261, "loss": 4.0367865562438965, "lr": 0.0002, "elapsed_sec": 51935.838918447495, "step_time_sec": 8.230671359982807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6262, "loss": 4.0490546226501465, "lr": 0.0002, "elapsed_sec": 51944.06991481781, "step_time_sec": 8.230840681993868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6263, "loss": 4.037408351898193, "lr": 0.0002, "elapsed_sec": 51952.30123233795, "step_time_sec": 8.231212929007597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6264, "loss": 3.801102638244629, "lr": 0.0002, "elapsed_sec": 51960.531135082245, "step_time_sec": 8.229711672000121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6265, "loss": 4.015713214874268, "lr": 0.0002, "elapsed_sec": 51968.760360479355, "step_time_sec": 8.22907510199002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6266, "loss": 4.185315132141113, "lr": 0.0002, "elapsed_sec": 51976.98898768425, "step_time_sec": 8.228521003009519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6267, "loss": 3.9937214851379395, "lr": 0.0002, "elapsed_sec": 51985.21681022644, "step_time_sec": 8.227650650020223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6268, "loss": 4.01632833480835, "lr": 0.0002, "elapsed_sec": 51993.445110082626, "step_time_sec": 8.228103550005471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6269, "loss": 4.033370018005371, "lr": 0.0002, "elapsed_sec": 52001.67512345314, "step_time_sec": 8.229855310026323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6270, "loss": 3.930241107940674, "lr": 0.0002, "elapsed_sec": 52009.90745019913, "step_time_sec": 8.232223800005158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6271, "loss": 4.007620334625244, "lr": 0.0002, "elapsed_sec": 52018.13785862923, "step_time_sec": 8.230188087007264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6272, "loss": 3.9249532222747803, "lr": 0.0002, "elapsed_sec": 52026.368530511856, "step_time_sec": 8.230526473984355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6273, "loss": 4.027539253234863, "lr": 0.0002, "elapsed_sec": 52034.59950256348, "step_time_sec": 8.230850897001801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6274, "loss": 4.006918907165527, "lr": 0.0002, "elapsed_sec": 52042.83042097092, "step_time_sec": 8.230747018009424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6275, "loss": 4.183333873748779, "lr": 0.0002, "elapsed_sec": 52051.06136035919, "step_time_sec": 8.230781566991936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6276, "loss": 3.9939475059509277, "lr": 0.0002, "elapsed_sec": 52059.29048895836, "step_time_sec": 8.228984904009849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6277, "loss": 3.9696483612060547, "lr": 0.0002, "elapsed_sec": 52067.52101278305, "step_time_sec": 8.230360960005783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6278, "loss": 3.9949660301208496, "lr": 0.0002, "elapsed_sec": 52075.752739429474, "step_time_sec": 8.23162506200606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6279, "loss": 3.88079571723938, "lr": 0.0002, "elapsed_sec": 52083.983303785324, "step_time_sec": 8.230341757007409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6280, "loss": 4.020831108093262, "lr": 0.0002, "elapsed_sec": 52092.21354055405, "step_time_sec": 8.230119323998224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6281, "loss": 3.8796560764312744, "lr": 0.0002, "elapsed_sec": 52100.44411063194, "step_time_sec": 8.230372839985648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6282, "loss": 4.045666217803955, "lr": 0.0002, "elapsed_sec": 52108.67385149002, "step_time_sec": 8.229600708000362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6283, "loss": 3.982945680618286, "lr": 0.0002, "elapsed_sec": 52116.90439963341, "step_time_sec": 8.230396811995888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6284, "loss": 4.08261775970459, "lr": 0.0002, "elapsed_sec": 52125.13572883606, "step_time_sec": 8.231192483013729, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6285, "loss": 3.8460233211517334, "lr": 0.0002, "elapsed_sec": 52133.36642026901, "step_time_sec": 8.230507357016904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6286, "loss": 4.080044746398926, "lr": 0.0002, "elapsed_sec": 52141.597138404846, "step_time_sec": 8.23056032700697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6287, "loss": 3.9325501918792725, "lr": 0.0002, "elapsed_sec": 52149.828085660934, "step_time_sec": 8.230815187998815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6288, "loss": 4.04541015625, "lr": 0.0002, "elapsed_sec": 52158.05893826485, "step_time_sec": 8.230645781994099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6289, "loss": 4.044198989868164, "lr": 0.0002, "elapsed_sec": 52166.290153265, "step_time_sec": 8.231060318998061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6290, "loss": 3.871403217315674, "lr": 0.0002, "elapsed_sec": 52174.519295692444, "step_time_sec": 8.229017852019751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6291, "loss": 4.062028408050537, "lr": 0.0002, "elapsed_sec": 52182.74703383446, "step_time_sec": 8.227578847989207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6292, "loss": 3.918548583984375, "lr": 0.0002, "elapsed_sec": 52190.977588415146, "step_time_sec": 8.230386143026408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6293, "loss": 3.933090925216675, "lr": 0.0002, "elapsed_sec": 52199.20812368393, "step_time_sec": 8.230403068009764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6294, "loss": 4.113200664520264, "lr": 0.0002, "elapsed_sec": 52207.438514232635, "step_time_sec": 8.230256153008668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6295, "loss": 3.9510960578918457, "lr": 0.0002, "elapsed_sec": 52215.6687669754, "step_time_sec": 8.230113013007212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6296, "loss": 3.9100687503814697, "lr": 0.0002, "elapsed_sec": 52223.90060162544, "step_time_sec": 8.231580292980652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6297, "loss": 3.976534843444824, "lr": 0.0002, "elapsed_sec": 52232.1303627491, "step_time_sec": 8.229639754019445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6298, "loss": 3.9322807788848877, "lr": 0.0002, "elapsed_sec": 52240.360654354095, "step_time_sec": 8.230150789022446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6299, "loss": 4.069744110107422, "lr": 0.0002, "elapsed_sec": 52248.591965675354, "step_time_sec": 8.231120489974273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6300, "loss": 3.8651576042175293, "lr": 0.0002, "elapsed_sec": 52256.81987762451, "step_time_sec": 8.227765230985824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6301, "loss": 4.14600133895874, "lr": 0.0002, "elapsed_sec": 52265.04881262779, "step_time_sec": 8.228836641996168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6302, "loss": 4.12929630279541, "lr": 0.0002, "elapsed_sec": 52273.276071071625, "step_time_sec": 8.22706905301311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6303, "loss": 4.079767227172852, "lr": 0.0002, "elapsed_sec": 52281.50567817688, "step_time_sec": 8.229440033988794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6304, "loss": 4.218192100524902, "lr": 0.0002, "elapsed_sec": 52289.732691049576, "step_time_sec": 8.226860002003377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6305, "loss": 4.076226711273193, "lr": 0.0002, "elapsed_sec": 52297.96263861656, "step_time_sec": 8.229872679017717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6306, "loss": 3.9992966651916504, "lr": 0.0002, "elapsed_sec": 52306.1922416687, "step_time_sec": 8.229414747998817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6307, "loss": 3.9444310665130615, "lr": 0.0002, "elapsed_sec": 52314.418541908264, "step_time_sec": 8.226092433993472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6308, "loss": 4.100748538970947, "lr": 0.0002, "elapsed_sec": 52322.647450208664, "step_time_sec": 8.228786505991593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6309, "loss": 3.975951910018921, "lr": 0.0002, "elapsed_sec": 52330.87661123276, "step_time_sec": 8.22901397800888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6310, "loss": 3.9145100116729736, "lr": 0.0002, "elapsed_sec": 52339.105543375015, "step_time_sec": 8.228755402000388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6311, "loss": 3.9734981060028076, "lr": 0.0002, "elapsed_sec": 52347.33421063423, "step_time_sec": 8.228564771998208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6312, "loss": 4.141246795654297, "lr": 0.0002, "elapsed_sec": 52355.56486129761, "step_time_sec": 8.230417148995912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6313, "loss": 4.1002349853515625, "lr": 0.0002, "elapsed_sec": 52363.79580998421, "step_time_sec": 8.23080668100738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6314, "loss": 3.760819911956787, "lr": 0.0002, "elapsed_sec": 52372.02704620361, "step_time_sec": 8.231095030991128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6315, "loss": 3.933067560195923, "lr": 0.0002, "elapsed_sec": 52380.256701231, "step_time_sec": 8.229545802983921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6316, "loss": 4.019864559173584, "lr": 0.0002, "elapsed_sec": 52388.488013505936, "step_time_sec": 8.231112007022602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6317, "loss": 4.130690097808838, "lr": 0.0002, "elapsed_sec": 52396.71852207184, "step_time_sec": 8.230371323006693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6318, "loss": 4.000553131103516, "lr": 0.0002, "elapsed_sec": 52404.9495370388, "step_time_sec": 8.23080254701199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6319, "loss": 4.031735897064209, "lr": 0.0002, "elapsed_sec": 52413.18081331253, "step_time_sec": 8.231141658994602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6320, "loss": 3.9873225688934326, "lr": 0.0002, "elapsed_sec": 52421.41111946106, "step_time_sec": 8.23014858001261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6321, "loss": 4.096611976623535, "lr": 0.0002, "elapsed_sec": 52429.639504909515, "step_time_sec": 8.22825706700678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6322, "loss": 4.221170425415039, "lr": 0.0002, "elapsed_sec": 52437.867533922195, "step_time_sec": 8.227843038999708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6323, "loss": 3.992229700088501, "lr": 0.0002, "elapsed_sec": 52446.09712576866, "step_time_sec": 8.229453199979616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6324, "loss": 4.096049785614014, "lr": 0.0002, "elapsed_sec": 52454.32618165016, "step_time_sec": 8.228899181995075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6325, "loss": 4.143571376800537, "lr": 0.0002, "elapsed_sec": 52462.553810834885, "step_time_sec": 8.2275106110028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6326, "loss": 3.9904696941375732, "lr": 0.0002, "elapsed_sec": 52470.782599687576, "step_time_sec": 8.22860820099595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6327, "loss": 4.084518909454346, "lr": 0.0002, "elapsed_sec": 52479.01358675957, "step_time_sec": 8.230825944017852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6328, "loss": 4.1202874183654785, "lr": 0.0002, "elapsed_sec": 52487.2447412014, "step_time_sec": 8.231013018987142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6329, "loss": 3.932185411453247, "lr": 0.0002, "elapsed_sec": 52495.4759042263, "step_time_sec": 8.231044429994654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6330, "loss": 3.9314000606536865, "lr": 0.0002, "elapsed_sec": 52503.70660972595, "step_time_sec": 8.230463904998032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6331, "loss": 3.9915590286254883, "lr": 0.0002, "elapsed_sec": 52511.93790984154, "step_time_sec": 8.231158042006427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6332, "loss": 4.058159351348877, "lr": 0.0002, "elapsed_sec": 52520.16784071922, "step_time_sec": 8.229849466995802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6333, "loss": 4.150701522827148, "lr": 0.0002, "elapsed_sec": 52528.39784049988, "step_time_sec": 8.229781528993044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6334, "loss": 4.068183422088623, "lr": 0.0002, "elapsed_sec": 52536.62793111801, "step_time_sec": 8.22997106902767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6335, "loss": 4.0256571769714355, "lr": 0.0002, "elapsed_sec": 52544.85437345505, "step_time_sec": 8.22624271400855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6336, "loss": 3.9848744869232178, "lr": 0.0002, "elapsed_sec": 52553.0824007988, "step_time_sec": 8.2278984549921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6337, "loss": 4.00853157043457, "lr": 0.0002, "elapsed_sec": 52561.311233997345, "step_time_sec": 8.228714125987608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6338, "loss": 4.006185531616211, "lr": 0.0002, "elapsed_sec": 52569.54117655754, "step_time_sec": 8.229768763994798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6339, "loss": 4.089603900909424, "lr": 0.0002, "elapsed_sec": 52577.76932525635, "step_time_sec": 8.227997331006918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6340, "loss": 3.886918306350708, "lr": 0.0002, "elapsed_sec": 52585.999677181244, "step_time_sec": 8.230163903004723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6341, "loss": 3.9484646320343018, "lr": 0.0002, "elapsed_sec": 52594.23078751564, "step_time_sec": 8.230976110993652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6342, "loss": 3.985417604446411, "lr": 0.0002, "elapsed_sec": 52602.4597799778, "step_time_sec": 8.22888457600493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6343, "loss": 3.9025299549102783, "lr": 0.0002, "elapsed_sec": 52610.69105362892, "step_time_sec": 8.231099352997262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6344, "loss": 4.027802467346191, "lr": 0.0002, "elapsed_sec": 52618.92140293121, "step_time_sec": 8.230123465997167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6345, "loss": 3.9470038414001465, "lr": 0.0002, "elapsed_sec": 52627.15216231346, "step_time_sec": 8.230677718005609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6346, "loss": 3.9906866550445557, "lr": 0.0002, "elapsed_sec": 52635.38281297684, "step_time_sec": 8.23050606198376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6347, "loss": 4.007834434509277, "lr": 0.0002, "elapsed_sec": 52643.614335775375, "step_time_sec": 8.231307770998683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6348, "loss": 4.009519577026367, "lr": 0.0002, "elapsed_sec": 52651.84446763992, "step_time_sec": 8.229975780006498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6349, "loss": 4.01297664642334, "lr": 0.0002, "elapsed_sec": 52660.074696302414, "step_time_sec": 8.230090286990162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6350, "loss": 4.1150221824646, "lr": 0.0002, "elapsed_sec": 52668.305502176285, "step_time_sec": 8.230678803985938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6351, "loss": 4.09898042678833, "lr": 0.0002, "elapsed_sec": 52676.53512763977, "step_time_sec": 8.229420076007955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6352, "loss": 3.9203248023986816, "lr": 0.0002, "elapsed_sec": 52684.76589465141, "step_time_sec": 8.23061639998923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6353, "loss": 3.993317127227783, "lr": 0.0002, "elapsed_sec": 52692.996408224106, "step_time_sec": 8.230433160002576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6354, "loss": 4.106831073760986, "lr": 0.0002, "elapsed_sec": 52701.227167367935, "step_time_sec": 8.230533456982812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6355, "loss": 4.050827503204346, "lr": 0.0002, "elapsed_sec": 52709.457528829575, "step_time_sec": 8.23020511699724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6356, "loss": 3.9531311988830566, "lr": 0.0002, "elapsed_sec": 52717.687435388565, "step_time_sec": 8.229777306987671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6357, "loss": 3.9623401165008545, "lr": 0.0002, "elapsed_sec": 52725.91903734207, "step_time_sec": 8.231406045000767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6358, "loss": 4.054720878601074, "lr": 0.0002, "elapsed_sec": 52734.15063786507, "step_time_sec": 8.231432483997196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6359, "loss": 4.207406997680664, "lr": 0.0002, "elapsed_sec": 52742.381308317184, "step_time_sec": 8.23051893999218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6360, "loss": 3.9584786891937256, "lr": 0.0002, "elapsed_sec": 52751.66746711731, "step_time_sec": 9.286004359018989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6361, "loss": 3.9530441761016846, "lr": 0.0002, "elapsed_sec": 52759.894860982895, "step_time_sec": 8.227254007011652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6362, "loss": 4.148501873016357, "lr": 0.0002, "elapsed_sec": 52768.124394893646, "step_time_sec": 8.22938100900501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6363, "loss": 4.0958147048950195, "lr": 0.0002, "elapsed_sec": 52776.35604310036, "step_time_sec": 8.231489719997626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6364, "loss": 4.051467418670654, "lr": 0.0002, "elapsed_sec": 52784.58679699898, "step_time_sec": 8.23060981699382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6365, "loss": 4.064938545227051, "lr": 0.0002, "elapsed_sec": 52792.81807088852, "step_time_sec": 8.231149818981066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6366, "loss": 4.060205459594727, "lr": 0.0002, "elapsed_sec": 52801.047807216644, "step_time_sec": 8.229518812993774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6367, "loss": 4.037891864776611, "lr": 0.0002, "elapsed_sec": 52809.27767896652, "step_time_sec": 8.2297271520074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6368, "loss": 4.094666481018066, "lr": 0.0002, "elapsed_sec": 52817.50929045677, "step_time_sec": 8.231513774982886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6369, "loss": 3.9542713165283203, "lr": 0.0002, "elapsed_sec": 52825.738329172134, "step_time_sec": 8.228872086998308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6370, "loss": 4.1095147132873535, "lr": 0.0002, "elapsed_sec": 52833.96670341492, "step_time_sec": 8.228241169010289, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6371, "loss": 4.0898871421813965, "lr": 0.0002, "elapsed_sec": 52842.19474482536, "step_time_sec": 8.22780015799799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6372, "loss": 4.00507926940918, "lr": 0.0002, "elapsed_sec": 52850.42507839203, "step_time_sec": 8.230182249011705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6373, "loss": 4.001865386962891, "lr": 0.0002, "elapsed_sec": 52858.654025793076, "step_time_sec": 8.228780693025328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6374, "loss": 4.039176940917969, "lr": 0.0002, "elapsed_sec": 52866.88384413719, "step_time_sec": 8.229686559992842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6375, "loss": 4.094254970550537, "lr": 0.0002, "elapsed_sec": 52875.11469435692, "step_time_sec": 8.23072746899561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6376, "loss": 3.9616401195526123, "lr": 0.0002, "elapsed_sec": 52883.34509110451, "step_time_sec": 8.230149660987081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6377, "loss": 3.9721028804779053, "lr": 0.0002, "elapsed_sec": 52891.57613539696, "step_time_sec": 8.23093092101044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6378, "loss": 3.908820867538452, "lr": 0.0002, "elapsed_sec": 52899.80802345276, "step_time_sec": 8.231722266005818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6379, "loss": 3.986532211303711, "lr": 0.0002, "elapsed_sec": 52908.03844046593, "step_time_sec": 8.230246250983328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6380, "loss": 4.164778232574463, "lr": 0.0002, "elapsed_sec": 52916.268936395645, "step_time_sec": 8.230299368995475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6381, "loss": 4.098295211791992, "lr": 0.0002, "elapsed_sec": 52924.49837732315, "step_time_sec": 8.229311274015345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6382, "loss": 4.161055088043213, "lr": 0.0002, "elapsed_sec": 52932.730075359344, "step_time_sec": 8.231582582986448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6383, "loss": 3.9793577194213867, "lr": 0.0002, "elapsed_sec": 52940.96087765694, "step_time_sec": 8.230633238999872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6384, "loss": 4.023858070373535, "lr": 0.0002, "elapsed_sec": 52949.19178771973, "step_time_sec": 8.230705842986936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6385, "loss": 3.9807448387145996, "lr": 0.0002, "elapsed_sec": 52957.42263388634, "step_time_sec": 8.230697937018704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6386, "loss": 3.9566493034362793, "lr": 0.0002, "elapsed_sec": 52965.65470504761, "step_time_sec": 8.231955288007157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6387, "loss": 3.9600934982299805, "lr": 0.0002, "elapsed_sec": 52973.88548231125, "step_time_sec": 8.230604535987368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6388, "loss": 4.045100688934326, "lr": 0.0002, "elapsed_sec": 52982.116250514984, "step_time_sec": 8.230638584005646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6389, "loss": 4.0927815437316895, "lr": 0.0002, "elapsed_sec": 52990.3476729393, "step_time_sec": 8.231238011998357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6390, "loss": 3.884284496307373, "lr": 0.0002, "elapsed_sec": 52998.578043699265, "step_time_sec": 8.23021370000788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6391, "loss": 3.9657392501831055, "lr": 0.0002, "elapsed_sec": 53006.80830383301, "step_time_sec": 8.230102002999047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6392, "loss": 3.9647817611694336, "lr": 0.0002, "elapsed_sec": 53015.03647994995, "step_time_sec": 8.228036629996495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6393, "loss": 3.958455801010132, "lr": 0.0002, "elapsed_sec": 53023.263682842255, "step_time_sec": 8.22704484401038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6394, "loss": 3.9569334983825684, "lr": 0.0002, "elapsed_sec": 53031.4954867363, "step_time_sec": 8.231631041009678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6395, "loss": 3.884908437728882, "lr": 0.0002, "elapsed_sec": 53039.72611451149, "step_time_sec": 8.230492253002012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6396, "loss": 3.958719253540039, "lr": 0.0002, "elapsed_sec": 53047.956483364105, "step_time_sec": 8.23025814598077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6397, "loss": 3.8780672550201416, "lr": 0.0002, "elapsed_sec": 53056.18663644791, "step_time_sec": 8.229980477015488, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6398, "loss": 4.001979351043701, "lr": 0.0002, "elapsed_sec": 53064.41742134094, "step_time_sec": 8.23059210399515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6399, "loss": 4.01711368560791, "lr": 0.0002, "elapsed_sec": 53072.648218631744, "step_time_sec": 8.230634181993082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6400, "loss": 4.038667678833008, "lr": 0.0002, "elapsed_sec": 53080.878323316574, "step_time_sec": 8.229997601010837, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6401, "loss": 4.067760467529297, "lr": 0.0002, "elapsed_sec": 53089.1097407341, "step_time_sec": 8.231233210011851, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6402, "loss": 4.019836902618408, "lr": 0.0002, "elapsed_sec": 53097.33998990059, "step_time_sec": 8.230112774996087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6403, "loss": 4.039683818817139, "lr": 0.0002, "elapsed_sec": 53105.570790052414, "step_time_sec": 8.23058657598449, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6404, "loss": 3.8791637420654297, "lr": 0.0002, "elapsed_sec": 53113.8020131588, "step_time_sec": 8.231134615984047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6405, "loss": 4.04738187789917, "lr": 0.0002, "elapsed_sec": 53122.03308439255, "step_time_sec": 8.230908391007688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6406, "loss": 3.9800057411193848, "lr": 0.0002, "elapsed_sec": 53130.26454401016, "step_time_sec": 8.231255366001278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6407, "loss": 3.8870158195495605, "lr": 0.0002, "elapsed_sec": 53138.49579548836, "step_time_sec": 8.231131476990413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6408, "loss": 3.968212127685547, "lr": 0.0002, "elapsed_sec": 53146.724908828735, "step_time_sec": 8.22895420299028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6409, "loss": 3.92930269241333, "lr": 0.0002, "elapsed_sec": 53154.953595638275, "step_time_sec": 8.228484135004692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6410, "loss": 3.954648971557617, "lr": 0.0002, "elapsed_sec": 53163.18433570862, "step_time_sec": 8.230647974007297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6411, "loss": 4.098351001739502, "lr": 0.0002, "elapsed_sec": 53171.41577553749, "step_time_sec": 8.231254890997661, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6412, "loss": 3.9215989112854004, "lr": 0.0002, "elapsed_sec": 53179.646710157394, "step_time_sec": 8.230788664019201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6413, "loss": 4.059643268585205, "lr": 0.0002, "elapsed_sec": 53187.87763571739, "step_time_sec": 8.230732019001152, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6414, "loss": 3.9733784198760986, "lr": 0.0002, "elapsed_sec": 53196.10855340958, "step_time_sec": 8.230807639018167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6415, "loss": 4.032944679260254, "lr": 0.0002, "elapsed_sec": 53204.33955168724, "step_time_sec": 8.230873907014029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6416, "loss": 4.14865779876709, "lr": 0.0002, "elapsed_sec": 53212.56909489632, "step_time_sec": 8.229318719008006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6417, "loss": 4.185150146484375, "lr": 0.0002, "elapsed_sec": 53220.79873299599, "step_time_sec": 8.229469989979407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6418, "loss": 4.045266151428223, "lr": 0.0002, "elapsed_sec": 53229.02895665169, "step_time_sec": 8.23006745899329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6419, "loss": 4.108531951904297, "lr": 0.0002, "elapsed_sec": 53237.26000523567, "step_time_sec": 8.230909125995822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6420, "loss": 4.028292655944824, "lr": 0.0002, "elapsed_sec": 53245.4893579483, "step_time_sec": 8.229159612004878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6421, "loss": 3.9487569332122803, "lr": 0.0002, "elapsed_sec": 53253.719985961914, "step_time_sec": 8.230496651027352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6422, "loss": 4.002478122711182, "lr": 0.0002, "elapsed_sec": 53261.95055484772, "step_time_sec": 8.23038816798362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6423, "loss": 3.8936855792999268, "lr": 0.0002, "elapsed_sec": 53270.18132352829, "step_time_sec": 8.23066612999537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6424, "loss": 3.964799165725708, "lr": 0.0002, "elapsed_sec": 53278.412194252014, "step_time_sec": 8.230729411006905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6425, "loss": 4.024555206298828, "lr": 0.0002, "elapsed_sec": 53286.64315509796, "step_time_sec": 8.230697449005675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6426, "loss": 3.9056434631347656, "lr": 0.0002, "elapsed_sec": 53294.873782873154, "step_time_sec": 8.230461358005414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6427, "loss": 3.9918088912963867, "lr": 0.0002, "elapsed_sec": 53303.104566812515, "step_time_sec": 8.230677305022255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6428, "loss": 4.0484619140625, "lr": 0.0002, "elapsed_sec": 53311.334035396576, "step_time_sec": 8.229314714000793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6429, "loss": 4.0769476890563965, "lr": 0.0002, "elapsed_sec": 53319.565311431885, "step_time_sec": 8.231072197988397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6430, "loss": 4.069423675537109, "lr": 0.0002, "elapsed_sec": 53327.79626607895, "step_time_sec": 8.230806512001436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6431, "loss": 3.968865156173706, "lr": 0.0002, "elapsed_sec": 53336.02921271324, "step_time_sec": 8.232784886000445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6432, "loss": 3.933142900466919, "lr": 0.0002, "elapsed_sec": 53344.257850170135, "step_time_sec": 8.228490218985826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6433, "loss": 4.030602931976318, "lr": 0.0002, "elapsed_sec": 53352.487203121185, "step_time_sec": 8.229180170979816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6434, "loss": 3.9172842502593994, "lr": 0.0002, "elapsed_sec": 53360.71827197075, "step_time_sec": 8.230917573004263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6435, "loss": 3.958195209503174, "lr": 0.0002, "elapsed_sec": 53368.949093580246, "step_time_sec": 8.230626521981321, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6436, "loss": 4.004999160766602, "lr": 0.0002, "elapsed_sec": 53377.17995810509, "step_time_sec": 8.23074058900238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6437, "loss": 4.038601398468018, "lr": 0.0002, "elapsed_sec": 53385.40986704826, "step_time_sec": 8.229752535000443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6438, "loss": 3.9912636280059814, "lr": 0.0002, "elapsed_sec": 53393.639108181, "step_time_sec": 8.229052570997737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6439, "loss": 3.843899965286255, "lr": 0.0002, "elapsed_sec": 53401.869389534, "step_time_sec": 8.230128317984054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6440, "loss": 4.069980621337891, "lr": 0.0002, "elapsed_sec": 53410.09964990616, "step_time_sec": 8.23015499097528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6441, "loss": 4.067249774932861, "lr": 0.0002, "elapsed_sec": 53418.33033943176, "step_time_sec": 8.230527525010984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6442, "loss": 3.890073776245117, "lr": 0.0002, "elapsed_sec": 53426.5603518486, "step_time_sec": 8.229832151992014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6443, "loss": 4.049488067626953, "lr": 0.0002, "elapsed_sec": 53434.79154062271, "step_time_sec": 8.230980863998411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6444, "loss": 3.8306729793548584, "lr": 0.0002, "elapsed_sec": 53443.01898813248, "step_time_sec": 8.227314290998038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6445, "loss": 4.079565048217773, "lr": 0.0002, "elapsed_sec": 53451.24591779709, "step_time_sec": 8.226797798008192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6446, "loss": 4.050688743591309, "lr": 0.0002, "elapsed_sec": 53459.47616958618, "step_time_sec": 8.23005864402512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6447, "loss": 3.9393508434295654, "lr": 0.0002, "elapsed_sec": 53467.70527768135, "step_time_sec": 8.228946956980508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6448, "loss": 3.817108631134033, "lr": 0.0002, "elapsed_sec": 53475.93542456627, "step_time_sec": 8.22999251598958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6449, "loss": 4.046084403991699, "lr": 0.0002, "elapsed_sec": 53484.1665687561, "step_time_sec": 8.23097122501349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6450, "loss": 3.9570610523223877, "lr": 0.0002, "elapsed_sec": 53492.396904706955, "step_time_sec": 8.230204696999863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6451, "loss": 3.9169435501098633, "lr": 0.0002, "elapsed_sec": 53500.62803387642, "step_time_sec": 8.231041633000132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6452, "loss": 3.756537914276123, "lr": 0.0002, "elapsed_sec": 53508.858874082565, "step_time_sec": 8.230610443977639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6453, "loss": 4.120080471038818, "lr": 0.0002, "elapsed_sec": 53517.0870885849, "step_time_sec": 8.22808717598673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6454, "loss": 4.084008693695068, "lr": 0.0002, "elapsed_sec": 53525.31812405586, "step_time_sec": 8.230843917990569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6455, "loss": 4.191123962402344, "lr": 0.0002, "elapsed_sec": 53533.549840688705, "step_time_sec": 8.231588653987274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6456, "loss": 4.212149143218994, "lr": 0.0002, "elapsed_sec": 53541.78037595749, "step_time_sec": 8.230397248000372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6457, "loss": 3.998175859451294, "lr": 0.0002, "elapsed_sec": 53550.011058568954, "step_time_sec": 8.230483267980162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6458, "loss": 3.9680182933807373, "lr": 0.0002, "elapsed_sec": 53558.242361068726, "step_time_sec": 8.231227178999688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6459, "loss": 4.059285640716553, "lr": 0.0002, "elapsed_sec": 53566.47219491005, "step_time_sec": 8.229635974974371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6460, "loss": 3.9935734272003174, "lr": 0.0002, "elapsed_sec": 53574.70312309265, "step_time_sec": 8.230726489011431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6461, "loss": 3.9266982078552246, "lr": 0.0002, "elapsed_sec": 53582.93312740326, "step_time_sec": 8.229841396998381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6462, "loss": 4.017544269561768, "lr": 0.0002, "elapsed_sec": 53591.16088652611, "step_time_sec": 8.227585720014758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6463, "loss": 4.018298625946045, "lr": 0.0002, "elapsed_sec": 53599.39141511917, "step_time_sec": 8.230395468999632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6464, "loss": 4.068049907684326, "lr": 0.0002, "elapsed_sec": 53607.62046480179, "step_time_sec": 8.228884025011212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6465, "loss": 4.047763824462891, "lr": 0.0002, "elapsed_sec": 53615.84972691536, "step_time_sec": 8.229127138998592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6466, "loss": 4.147669315338135, "lr": 0.0002, "elapsed_sec": 53624.07923960686, "step_time_sec": 8.229362479003612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6467, "loss": 3.9467968940734863, "lr": 0.0002, "elapsed_sec": 53632.30846977234, "step_time_sec": 8.229089659987949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6468, "loss": 4.04522705078125, "lr": 0.0002, "elapsed_sec": 53640.536942481995, "step_time_sec": 8.22830706401146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6469, "loss": 3.955962896347046, "lr": 0.0002, "elapsed_sec": 53648.76452255249, "step_time_sec": 8.22742170499987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6470, "loss": 3.907884359359741, "lr": 0.0002, "elapsed_sec": 53656.995371580124, "step_time_sec": 8.230738629994448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6471, "loss": 3.8709025382995605, "lr": 0.0002, "elapsed_sec": 53665.22636413574, "step_time_sec": 8.230795554991346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6472, "loss": 4.025010108947754, "lr": 0.0002, "elapsed_sec": 53673.45593833923, "step_time_sec": 8.229466999007855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6473, "loss": 3.961733341217041, "lr": 0.0002, "elapsed_sec": 53681.68486714363, "step_time_sec": 8.228779897995992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6474, "loss": 3.899610996246338, "lr": 0.0002, "elapsed_sec": 53689.91309809685, "step_time_sec": 8.22805758798495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6475, "loss": 3.854517936706543, "lr": 0.0002, "elapsed_sec": 53698.14407634735, "step_time_sec": 8.230801728001097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6476, "loss": 4.027774333953857, "lr": 0.0002, "elapsed_sec": 53706.374249219894, "step_time_sec": 8.230023998999968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6477, "loss": 4.25165319442749, "lr": 0.0002, "elapsed_sec": 53714.60426616669, "step_time_sec": 8.229879561986309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6478, "loss": 4.004417419433594, "lr": 0.0002, "elapsed_sec": 53722.834089279175, "step_time_sec": 8.2296307090146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6479, "loss": 4.001035690307617, "lr": 0.0002, "elapsed_sec": 53731.06450724602, "step_time_sec": 8.2303015270154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6480, "loss": 3.9678919315338135, "lr": 0.0002, "elapsed_sec": 53739.293937683105, "step_time_sec": 8.229215037019458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6481, "loss": 3.9379022121429443, "lr": 0.0002, "elapsed_sec": 53747.52297830582, "step_time_sec": 8.228870663006091, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6482, "loss": 4.010372161865234, "lr": 0.0002, "elapsed_sec": 53755.751750946045, "step_time_sec": 8.228631158999633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6483, "loss": 3.9607744216918945, "lr": 0.0002, "elapsed_sec": 53763.98148441315, "step_time_sec": 8.22956775300554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6484, "loss": 3.988752603530884, "lr": 0.0002, "elapsed_sec": 53772.20997595787, "step_time_sec": 8.228341628011549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6485, "loss": 3.7379491329193115, "lr": 0.0002, "elapsed_sec": 53780.4415371418, "step_time_sec": 8.231384207989322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6486, "loss": 4.045934200286865, "lr": 0.0002, "elapsed_sec": 53788.671612501144, "step_time_sec": 8.229962192999665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6487, "loss": 3.866682529449463, "lr": 0.0002, "elapsed_sec": 53796.89973092079, "step_time_sec": 8.227965538011631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6488, "loss": 3.964895248413086, "lr": 0.0002, "elapsed_sec": 53805.12877011299, "step_time_sec": 8.228817564988276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6489, "loss": 4.010685920715332, "lr": 0.0002, "elapsed_sec": 53813.358761548996, "step_time_sec": 8.229865518020233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6490, "loss": 3.771085023880005, "lr": 0.0002, "elapsed_sec": 53821.589394807816, "step_time_sec": 8.230476958007785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6491, "loss": 3.988055944442749, "lr": 0.0002, "elapsed_sec": 53829.81933760643, "step_time_sec": 8.229763795010513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6492, "loss": 3.9242103099823, "lr": 0.0002, "elapsed_sec": 53838.04852151871, "step_time_sec": 8.228996639023535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6493, "loss": 3.8930253982543945, "lr": 0.0002, "elapsed_sec": 53846.2769651413, "step_time_sec": 8.22827968900674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6494, "loss": 3.9440598487854004, "lr": 0.0002, "elapsed_sec": 53854.50611662865, "step_time_sec": 8.229045226995368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6495, "loss": 4.0494279861450195, "lr": 0.0002, "elapsed_sec": 53862.7368645668, "step_time_sec": 8.230568001017673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6496, "loss": 4.065281391143799, "lr": 0.0002, "elapsed_sec": 53870.96716403961, "step_time_sec": 8.230174569005612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6497, "loss": 3.8552610874176025, "lr": 0.0002, "elapsed_sec": 53879.195687294006, "step_time_sec": 8.228344866016414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6498, "loss": 3.9025161266326904, "lr": 0.0002, "elapsed_sec": 53887.42383432388, "step_time_sec": 8.22801149301813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6499, "loss": 3.8929905891418457, "lr": 0.0002, "elapsed_sec": 53895.65378308296, "step_time_sec": 8.229838350001955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6500, "loss": 3.871833086013794, "lr": 0.0002, "elapsed_sec": 53903.88508439064, "step_time_sec": 30.858908451977186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6501, "loss": 4.067469596862793, "lr": 0.0002, "elapsed_sec": 53934.75958585739, "step_time_sec": 8.246178942994447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6502, "loss": 3.95497465133667, "lr": 0.0002, "elapsed_sec": 53942.97621536255, "step_time_sec": 8.216474054002902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6503, "loss": 4.1164870262146, "lr": 0.0002, "elapsed_sec": 53951.19237732887, "step_time_sec": 8.215999045991339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6504, "loss": 4.038687229156494, "lr": 0.0002, "elapsed_sec": 53959.40956020355, "step_time_sec": 8.217060627997853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6505, "loss": 3.949143171310425, "lr": 0.0002, "elapsed_sec": 53967.627729177475, "step_time_sec": 8.217993105994537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6506, "loss": 4.008578777313232, "lr": 0.0002, "elapsed_sec": 53975.85693573952, "step_time_sec": 8.22906451698509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6507, "loss": 3.988384485244751, "lr": 0.0002, "elapsed_sec": 53984.087337732315, "step_time_sec": 8.230253543006256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6508, "loss": 3.9827518463134766, "lr": 0.0002, "elapsed_sec": 53992.318182468414, "step_time_sec": 8.230735057004495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6509, "loss": 3.9971749782562256, "lr": 0.0002, "elapsed_sec": 54000.54851317406, "step_time_sec": 8.230118103005225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6510, "loss": 3.6183362007141113, "lr": 0.0002, "elapsed_sec": 54008.77624678612, "step_time_sec": 8.227619170997059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6511, "loss": 4.160771369934082, "lr": 0.0002, "elapsed_sec": 54017.00582432747, "step_time_sec": 8.229395439004293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6512, "loss": 3.9039907455444336, "lr": 0.0002, "elapsed_sec": 54025.23408842087, "step_time_sec": 8.22806162800407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6513, "loss": 4.078165531158447, "lr": 0.0002, "elapsed_sec": 54033.462878227234, "step_time_sec": 8.228633621009067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6514, "loss": 4.158659934997559, "lr": 0.0002, "elapsed_sec": 54041.691891908646, "step_time_sec": 8.228853106003953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6515, "loss": 4.014571666717529, "lr": 0.0002, "elapsed_sec": 54049.92256331444, "step_time_sec": 8.230554344016127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6516, "loss": 4.021698474884033, "lr": 0.0002, "elapsed_sec": 54058.15290617943, "step_time_sec": 8.230231351975817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6517, "loss": 3.896308422088623, "lr": 0.0002, "elapsed_sec": 54066.3836081028, "step_time_sec": 8.230472297000233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6518, "loss": 4.136531352996826, "lr": 0.0002, "elapsed_sec": 54074.61362671852, "step_time_sec": 8.229895514989039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6519, "loss": 3.9971864223480225, "lr": 0.0002, "elapsed_sec": 54082.844133377075, "step_time_sec": 8.23033227500855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6520, "loss": 3.964874029159546, "lr": 0.0002, "elapsed_sec": 54091.0726480484, "step_time_sec": 8.22835598301026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6521, "loss": 3.982651472091675, "lr": 0.0002, "elapsed_sec": 54099.30199956894, "step_time_sec": 8.22920565700042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6522, "loss": 4.0763092041015625, "lr": 0.0002, "elapsed_sec": 54107.532036304474, "step_time_sec": 8.2298610379803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6523, "loss": 4.054582595825195, "lr": 0.0002, "elapsed_sec": 54115.762768268585, "step_time_sec": 8.230544669000665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6524, "loss": 4.086359977722168, "lr": 0.0002, "elapsed_sec": 54123.99341964722, "step_time_sec": 8.230559443007223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6525, "loss": 3.8625199794769287, "lr": 0.0002, "elapsed_sec": 54132.22395658493, "step_time_sec": 8.230385301983915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6526, "loss": 3.850266933441162, "lr": 0.0002, "elapsed_sec": 54140.45563197136, "step_time_sec": 8.231459239992546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6527, "loss": 4.003978252410889, "lr": 0.0002, "elapsed_sec": 54148.685423374176, "step_time_sec": 8.229693111992674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6528, "loss": 3.9143738746643066, "lr": 0.0002, "elapsed_sec": 54156.916841983795, "step_time_sec": 8.23121518798871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6529, "loss": 3.979189157485962, "lr": 0.0002, "elapsed_sec": 54165.147975206375, "step_time_sec": 8.230971886019688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6530, "loss": 3.8784773349761963, "lr": 0.0002, "elapsed_sec": 54173.37888669968, "step_time_sec": 8.230785424006172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6531, "loss": 3.9375455379486084, "lr": 0.0002, "elapsed_sec": 54181.60948634148, "step_time_sec": 8.230448009009706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6532, "loss": 3.882333755493164, "lr": 0.0002, "elapsed_sec": 54189.84022068977, "step_time_sec": 8.230553061002865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6533, "loss": 3.821518659591675, "lr": 0.0002, "elapsed_sec": 54198.069190979004, "step_time_sec": 8.228817130002426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6534, "loss": 4.247792720794678, "lr": 0.0002, "elapsed_sec": 54206.29779911041, "step_time_sec": 8.228466325002955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6535, "loss": 3.961920738220215, "lr": 0.0002, "elapsed_sec": 54214.52697253227, "step_time_sec": 8.229034701973433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6536, "loss": 4.072026252746582, "lr": 0.0002, "elapsed_sec": 54222.756756305695, "step_time_sec": 8.229667439998593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6537, "loss": 3.9031591415405273, "lr": 0.0002, "elapsed_sec": 54230.98818278313, "step_time_sec": 8.231252073019277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6538, "loss": 3.936478614807129, "lr": 0.0002, "elapsed_sec": 54239.217728853226, "step_time_sec": 8.229364453989547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6539, "loss": 4.213637351989746, "lr": 0.0002, "elapsed_sec": 54247.44616866112, "step_time_sec": 8.228362241003197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6540, "loss": 3.9264931678771973, "lr": 0.0002, "elapsed_sec": 54255.67665362358, "step_time_sec": 8.230326070974115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6541, "loss": 3.9307661056518555, "lr": 0.0002, "elapsed_sec": 54263.907052755356, "step_time_sec": 8.230221139005153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6542, "loss": 3.9447929859161377, "lr": 0.0002, "elapsed_sec": 54272.13757276535, "step_time_sec": 8.230366062984103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6543, "loss": 4.109988212585449, "lr": 0.0002, "elapsed_sec": 54280.368465423584, "step_time_sec": 8.230702304979786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6544, "loss": 3.9945716857910156, "lr": 0.0002, "elapsed_sec": 54288.598575115204, "step_time_sec": 8.2299499050132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6545, "loss": 3.8871521949768066, "lr": 0.0002, "elapsed_sec": 54296.828296899796, "step_time_sec": 8.229606120992685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6546, "loss": 3.8220643997192383, "lr": 0.0002, "elapsed_sec": 54305.05851149559, "step_time_sec": 8.230029291997198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6547, "loss": 3.960928440093994, "lr": 0.0002, "elapsed_sec": 54313.28915262222, "step_time_sec": 8.230500281002605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6548, "loss": 4.01149845123291, "lr": 0.0002, "elapsed_sec": 54321.51947975159, "step_time_sec": 8.230164148000767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6549, "loss": 4.054417610168457, "lr": 0.0002, "elapsed_sec": 54329.74922847748, "step_time_sec": 8.229605294996873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6550, "loss": 3.860774278640747, "lr": 0.0002, "elapsed_sec": 54337.97966384888, "step_time_sec": 8.230338476016186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6551, "loss": 4.065942764282227, "lr": 0.0002, "elapsed_sec": 54346.20890212059, "step_time_sec": 8.229074595990824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6552, "loss": 4.050365447998047, "lr": 0.0002, "elapsed_sec": 54354.4389295578, "step_time_sec": 8.229826772992965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6553, "loss": 3.8280246257781982, "lr": 0.0002, "elapsed_sec": 54362.66745495796, "step_time_sec": 8.22840738800005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6554, "loss": 3.9817099571228027, "lr": 0.0002, "elapsed_sec": 54370.89567875862, "step_time_sec": 8.228025528980652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6555, "loss": 3.868565082550049, "lr": 0.0002, "elapsed_sec": 54379.12344408035, "step_time_sec": 8.22761584399268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6556, "loss": 4.096829414367676, "lr": 0.0002, "elapsed_sec": 54387.35337114334, "step_time_sec": 8.229745878983522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6557, "loss": 4.009466171264648, "lr": 0.0002, "elapsed_sec": 54395.58340072632, "step_time_sec": 8.229860341991298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6558, "loss": 3.9468939304351807, "lr": 0.0002, "elapsed_sec": 54403.81308913231, "step_time_sec": 8.229602961015189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6559, "loss": 3.879892110824585, "lr": 0.0002, "elapsed_sec": 54412.04218864441, "step_time_sec": 8.228888072975678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6560, "loss": 3.970079183578491, "lr": 0.0002, "elapsed_sec": 54420.27004814148, "step_time_sec": 8.227699559007306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6561, "loss": 4.067088603973389, "lr": 0.0002, "elapsed_sec": 54428.498888731, "step_time_sec": 8.22871535498416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6562, "loss": 3.857961893081665, "lr": 0.0002, "elapsed_sec": 54436.72986268997, "step_time_sec": 8.230797567986883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6563, "loss": 4.101456642150879, "lr": 0.0002, "elapsed_sec": 54444.95887541771, "step_time_sec": 8.228807683015475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6564, "loss": 3.935904026031494, "lr": 0.0002, "elapsed_sec": 54453.18636035919, "step_time_sec": 8.227316069009248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6565, "loss": 3.873857021331787, "lr": 0.0002, "elapsed_sec": 54461.41335821152, "step_time_sec": 8.226821988006122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6566, "loss": 3.94610595703125, "lr": 0.0002, "elapsed_sec": 54469.643959999084, "step_time_sec": 8.230467178975232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6567, "loss": 3.79402494430542, "lr": 0.0002, "elapsed_sec": 54477.87366914749, "step_time_sec": 8.229614901007153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6568, "loss": 4.199707508087158, "lr": 0.0002, "elapsed_sec": 54486.10408473015, "step_time_sec": 8.23016607601312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6569, "loss": 3.868206739425659, "lr": 0.0002, "elapsed_sec": 54494.33229470253, "step_time_sec": 8.22807072501746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6570, "loss": 3.911656618118286, "lr": 0.0002, "elapsed_sec": 54502.56249308586, "step_time_sec": 8.230018768023001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6571, "loss": 4.016030311584473, "lr": 0.0002, "elapsed_sec": 54510.79199242592, "step_time_sec": 8.229362886981107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6572, "loss": 4.013605117797852, "lr": 0.0002, "elapsed_sec": 54519.02148413658, "step_time_sec": 8.229355382005451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6573, "loss": 3.933534860610962, "lr": 0.0002, "elapsed_sec": 54527.25103235245, "step_time_sec": 8.229350734996842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6574, "loss": 3.8024260997772217, "lr": 0.0002, "elapsed_sec": 54535.48008775711, "step_time_sec": 8.228876832989044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6575, "loss": 4.135559558868408, "lr": 0.0002, "elapsed_sec": 54543.70899248123, "step_time_sec": 8.228768035012763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6576, "loss": 3.9328482151031494, "lr": 0.0002, "elapsed_sec": 54551.939133644104, "step_time_sec": 8.229943362995982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6577, "loss": 3.9892258644104004, "lr": 0.0002, "elapsed_sec": 54560.17047691345, "step_time_sec": 8.23118889000034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6578, "loss": 3.914517879486084, "lr": 0.0002, "elapsed_sec": 54568.3996450901, "step_time_sec": 8.228969749005046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6579, "loss": 3.9221558570861816, "lr": 0.0002, "elapsed_sec": 54576.62777352333, "step_time_sec": 8.227975057001458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6580, "loss": 4.095399856567383, "lr": 0.0002, "elapsed_sec": 54584.857719659805, "step_time_sec": 8.229789779987186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6581, "loss": 3.855846405029297, "lr": 0.0002, "elapsed_sec": 54593.08573889732, "step_time_sec": 8.227912050002487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6582, "loss": 3.8956964015960693, "lr": 0.0002, "elapsed_sec": 54601.31432890892, "step_time_sec": 8.228365089016734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6583, "loss": 4.080234527587891, "lr": 0.0002, "elapsed_sec": 54609.54264950752, "step_time_sec": 8.22817712899996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6584, "loss": 3.9661672115325928, "lr": 0.0002, "elapsed_sec": 54617.77081465721, "step_time_sec": 8.228057459025877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6585, "loss": 4.073153018951416, "lr": 0.0002, "elapsed_sec": 54626.00095272064, "step_time_sec": 8.229912953014718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6586, "loss": 3.8885304927825928, "lr": 0.0002, "elapsed_sec": 54634.23089361191, "step_time_sec": 8.229838447994553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6587, "loss": 4.048900127410889, "lr": 0.0002, "elapsed_sec": 54642.46129274368, "step_time_sec": 8.230178951023845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6588, "loss": 3.8878138065338135, "lr": 0.0002, "elapsed_sec": 54650.69311213493, "step_time_sec": 8.231659065990243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6589, "loss": 3.884115219116211, "lr": 0.0002, "elapsed_sec": 54658.92362642288, "step_time_sec": 8.230376298015472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6590, "loss": 4.055511951446533, "lr": 0.0002, "elapsed_sec": 54667.154789447784, "step_time_sec": 8.23101407600916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6591, "loss": 3.9150443077087402, "lr": 0.0002, "elapsed_sec": 54675.3853187561, "step_time_sec": 8.230429452989483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6592, "loss": 3.969630002975464, "lr": 0.0002, "elapsed_sec": 54683.61652326584, "step_time_sec": 8.230974532023538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6593, "loss": 3.876009702682495, "lr": 0.0002, "elapsed_sec": 54691.84511804581, "step_time_sec": 8.22845280400361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6594, "loss": 3.807100534439087, "lr": 0.0002, "elapsed_sec": 54700.073050260544, "step_time_sec": 8.22781786401174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6595, "loss": 4.047976016998291, "lr": 0.0002, "elapsed_sec": 54708.30228662491, "step_time_sec": 8.229111595021095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6596, "loss": 3.946650505065918, "lr": 0.0002, "elapsed_sec": 54716.53225135803, "step_time_sec": 8.229711289022816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6597, "loss": 3.8875527381896973, "lr": 0.0002, "elapsed_sec": 54724.762665748596, "step_time_sec": 8.230301547999261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6598, "loss": 3.9657936096191406, "lr": 0.0002, "elapsed_sec": 54732.99340605736, "step_time_sec": 8.230606765981065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6599, "loss": 4.1070051193237305, "lr": 0.0002, "elapsed_sec": 54741.224804878235, "step_time_sec": 8.231171469000401, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6600, "loss": 3.838104009628296, "lr": 0.0002, "elapsed_sec": 54749.455137491226, "step_time_sec": 8.230226652987767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6601, "loss": 4.078604221343994, "lr": 0.0002, "elapsed_sec": 54757.685710430145, "step_time_sec": 8.23035673800041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6602, "loss": 3.881230592727661, "lr": 0.0002, "elapsed_sec": 54765.9169318676, "step_time_sec": 8.231078436976532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6603, "loss": 3.925508737564087, "lr": 0.0002, "elapsed_sec": 54774.14730095863, "step_time_sec": 8.230269097985001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6604, "loss": 3.841207265853882, "lr": 0.0002, "elapsed_sec": 54782.37817597389, "step_time_sec": 8.230682468012674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6605, "loss": 3.9385175704956055, "lr": 0.0002, "elapsed_sec": 54790.608936309814, "step_time_sec": 8.230572310014395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6606, "loss": 3.9744439125061035, "lr": 0.0002, "elapsed_sec": 54798.84029555321, "step_time_sec": 8.23120662101428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6607, "loss": 4.00516939163208, "lr": 0.0002, "elapsed_sec": 54807.06984138489, "step_time_sec": 8.229379161988618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6608, "loss": 3.892514705657959, "lr": 0.0002, "elapsed_sec": 54815.29733753204, "step_time_sec": 8.22735027098679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6609, "loss": 3.9219257831573486, "lr": 0.0002, "elapsed_sec": 54823.52819895744, "step_time_sec": 8.230692879005801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6610, "loss": 3.890136480331421, "lr": 0.0002, "elapsed_sec": 54831.758697748184, "step_time_sec": 8.23036389099434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6611, "loss": 4.013800144195557, "lr": 0.0002, "elapsed_sec": 54839.98943686485, "step_time_sec": 8.230584370001452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6612, "loss": 4.075018405914307, "lr": 0.0002, "elapsed_sec": 54848.21846675873, "step_time_sec": 8.228891712991754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6613, "loss": 4.063129425048828, "lr": 0.0002, "elapsed_sec": 54856.44895076752, "step_time_sec": 8.230293842992978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6614, "loss": 3.944270372390747, "lr": 0.0002, "elapsed_sec": 54864.67732477188, "step_time_sec": 8.228228255000431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6615, "loss": 4.002574920654297, "lr": 0.0002, "elapsed_sec": 54872.906170368195, "step_time_sec": 8.22865128700505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6616, "loss": 4.054176330566406, "lr": 0.0002, "elapsed_sec": 54881.13613986969, "step_time_sec": 8.229803214984713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6617, "loss": 3.885982036590576, "lr": 0.0002, "elapsed_sec": 54889.36648392677, "step_time_sec": 8.230230065004434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6618, "loss": 4.067574977874756, "lr": 0.0002, "elapsed_sec": 54897.59725522995, "step_time_sec": 8.23053084500134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6619, "loss": 4.057928562164307, "lr": 0.0002, "elapsed_sec": 54905.826813697815, "step_time_sec": 8.229395873000612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6620, "loss": 4.1505584716796875, "lr": 0.0002, "elapsed_sec": 54914.05514025688, "step_time_sec": 8.22818798298249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6621, "loss": 3.896184206008911, "lr": 0.0002, "elapsed_sec": 54922.28356003761, "step_time_sec": 8.22832691099029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6622, "loss": 4.037418842315674, "lr": 0.0002, "elapsed_sec": 54930.51265716553, "step_time_sec": 8.228887305012904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6623, "loss": 3.8655264377593994, "lr": 0.0002, "elapsed_sec": 54938.74255776405, "step_time_sec": 8.229723938013194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6624, "loss": 3.9330971240997314, "lr": 0.0002, "elapsed_sec": 54946.973776102066, "step_time_sec": 8.23111801900086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6625, "loss": 3.826066017150879, "lr": 0.0002, "elapsed_sec": 54955.204807281494, "step_time_sec": 8.230895492975833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6626, "loss": 3.8966875076293945, "lr": 0.0002, "elapsed_sec": 54963.435750961304, "step_time_sec": 8.230791668989696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6627, "loss": 4.0091938972473145, "lr": 0.0002, "elapsed_sec": 54971.66428089142, "step_time_sec": 8.228329664008925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6628, "loss": 3.9789822101593018, "lr": 0.0002, "elapsed_sec": 54979.895401239395, "step_time_sec": 8.231000194995431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6629, "loss": 3.9469172954559326, "lr": 0.0002, "elapsed_sec": 54988.125918626785, "step_time_sec": 8.230314223997993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6630, "loss": 3.9659013748168945, "lr": 0.0002, "elapsed_sec": 54996.35679149628, "step_time_sec": 8.23071769499802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6631, "loss": 3.9436962604522705, "lr": 0.0002, "elapsed_sec": 55004.58816409111, "step_time_sec": 8.231251809018431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6632, "loss": 4.040942192077637, "lr": 0.0002, "elapsed_sec": 55012.819113492966, "step_time_sec": 8.23072604299523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6633, "loss": 3.89459490776062, "lr": 0.0002, "elapsed_sec": 55021.047924757004, "step_time_sec": 8.228640932007693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6634, "loss": 3.9432621002197266, "lr": 0.0002, "elapsed_sec": 55029.27812099457, "step_time_sec": 8.230099792999681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6635, "loss": 3.885833501815796, "lr": 0.0002, "elapsed_sec": 55037.50980806351, "step_time_sec": 8.23151642800076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6636, "loss": 3.8843350410461426, "lr": 0.0002, "elapsed_sec": 55045.741107702255, "step_time_sec": 8.231080202007433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6637, "loss": 4.01121187210083, "lr": 0.0002, "elapsed_sec": 55053.97089600563, "step_time_sec": 8.229707689984934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6638, "loss": 4.022536277770996, "lr": 0.0002, "elapsed_sec": 55062.201357126236, "step_time_sec": 8.230214861017885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6639, "loss": 3.884678363800049, "lr": 0.0002, "elapsed_sec": 55070.431462049484, "step_time_sec": 8.22996219000197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6640, "loss": 3.915024518966675, "lr": 0.0002, "elapsed_sec": 55078.66248226166, "step_time_sec": 8.230892428022344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6641, "loss": 3.9306910037994385, "lr": 0.0002, "elapsed_sec": 55086.89303421974, "step_time_sec": 8.230326326010982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6642, "loss": 4.183370590209961, "lr": 0.0002, "elapsed_sec": 55095.121666669846, "step_time_sec": 8.22855101400637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6643, "loss": 3.934264659881592, "lr": 0.0002, "elapsed_sec": 55103.35210251808, "step_time_sec": 8.23023894999642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6644, "loss": 3.847156286239624, "lr": 0.0002, "elapsed_sec": 55111.58334040642, "step_time_sec": 8.23104611999588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6645, "loss": 3.82957124710083, "lr": 0.0002, "elapsed_sec": 55119.813921928406, "step_time_sec": 8.23038926400477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6646, "loss": 3.902285575866699, "lr": 0.0002, "elapsed_sec": 55128.04459357262, "step_time_sec": 8.230502719990909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6647, "loss": 3.944124698638916, "lr": 0.0002, "elapsed_sec": 55136.2763106823, "step_time_sec": 8.231568322982639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6648, "loss": 3.9589877128601074, "lr": 0.0002, "elapsed_sec": 55144.50688743591, "step_time_sec": 8.230468466994353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6649, "loss": 3.969559669494629, "lr": 0.0002, "elapsed_sec": 55152.73804354668, "step_time_sec": 8.230958746018587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6650, "loss": 4.0442070960998535, "lr": 0.0002, "elapsed_sec": 55160.96837592125, "step_time_sec": 8.230136461992515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6651, "loss": 3.7184698581695557, "lr": 0.0002, "elapsed_sec": 55169.19875526428, "step_time_sec": 8.23025079301442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6652, "loss": 3.9503424167633057, "lr": 0.0002, "elapsed_sec": 55177.42938375473, "step_time_sec": 8.230459415994119, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6653, "loss": 3.825741767883301, "lr": 0.0002, "elapsed_sec": 55185.659603595734, "step_time_sec": 8.230115607992047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6654, "loss": 4.013542175292969, "lr": 0.0002, "elapsed_sec": 55193.891033649445, "step_time_sec": 8.231279778003227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6655, "loss": 3.986527442932129, "lr": 0.0002, "elapsed_sec": 55202.12090444565, "step_time_sec": 8.229637085983995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6656, "loss": 3.9263949394226074, "lr": 0.0002, "elapsed_sec": 55210.35162949562, "step_time_sec": 8.230592488020193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6657, "loss": 4.012850284576416, "lr": 0.0002, "elapsed_sec": 55218.58225488663, "step_time_sec": 8.230491850001272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6658, "loss": 3.9479238986968994, "lr": 0.0002, "elapsed_sec": 55226.81263041496, "step_time_sec": 8.230173151008785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6659, "loss": 3.9349937438964844, "lr": 0.0002, "elapsed_sec": 55235.04325866699, "step_time_sec": 8.230474787007552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6660, "loss": 3.962836265563965, "lr": 0.0002, "elapsed_sec": 55243.27203774452, "step_time_sec": 8.228675126010785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6661, "loss": 3.9022128582000732, "lr": 0.0002, "elapsed_sec": 55251.50257730484, "step_time_sec": 8.230336628010264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6662, "loss": 3.750035524368286, "lr": 0.0002, "elapsed_sec": 55259.73311090469, "step_time_sec": 8.230394952988718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6663, "loss": 3.901221513748169, "lr": 0.0002, "elapsed_sec": 55267.963676929474, "step_time_sec": 8.230372430989519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6664, "loss": 4.121871471405029, "lr": 0.0002, "elapsed_sec": 55276.19432735443, "step_time_sec": 8.230537716008257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6665, "loss": 3.8915648460388184, "lr": 0.0002, "elapsed_sec": 55284.42433929443, "step_time_sec": 8.229843037988758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6666, "loss": 3.8396503925323486, "lr": 0.0002, "elapsed_sec": 55292.654183387756, "step_time_sec": 8.229706163983792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6667, "loss": 4.061798095703125, "lr": 0.0002, "elapsed_sec": 55300.882611989975, "step_time_sec": 8.228230730019277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6668, "loss": 3.943091630935669, "lr": 0.0002, "elapsed_sec": 55309.11337852478, "step_time_sec": 8.230615223001223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6669, "loss": 4.148443698883057, "lr": 0.0002, "elapsed_sec": 55317.34481072426, "step_time_sec": 8.231265359005192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6670, "loss": 3.939049243927002, "lr": 0.0002, "elapsed_sec": 55325.57613945007, "step_time_sec": 8.231195939995814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6671, "loss": 3.9379351139068604, "lr": 0.0002, "elapsed_sec": 55333.80470609665, "step_time_sec": 8.228463987004943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6672, "loss": 3.9401297569274902, "lr": 0.0002, "elapsed_sec": 55342.03406047821, "step_time_sec": 8.229117659007898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6673, "loss": 4.031454563140869, "lr": 0.0002, "elapsed_sec": 55350.26522278786, "step_time_sec": 8.231023806001758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6674, "loss": 3.8351709842681885, "lr": 0.0002, "elapsed_sec": 55358.4958922863, "step_time_sec": 8.230472919007298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6675, "loss": 3.9039595127105713, "lr": 0.0002, "elapsed_sec": 55366.727244615555, "step_time_sec": 8.231245677015977, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6676, "loss": 3.962076425552368, "lr": 0.0002, "elapsed_sec": 55374.95818090439, "step_time_sec": 8.230763188999845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6677, "loss": 3.959362030029297, "lr": 0.0002, "elapsed_sec": 55383.18841171265, "step_time_sec": 8.230021027004113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6678, "loss": 3.863994598388672, "lr": 0.0002, "elapsed_sec": 55391.41897392273, "step_time_sec": 8.230414263001876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6679, "loss": 3.9769022464752197, "lr": 0.0002, "elapsed_sec": 55399.64902353287, "step_time_sec": 8.229951424000319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6680, "loss": 3.9370222091674805, "lr": 0.0002, "elapsed_sec": 55407.87965607643, "step_time_sec": 8.230405295995297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6681, "loss": 4.019830703735352, "lr": 0.0002, "elapsed_sec": 55416.10808134079, "step_time_sec": 8.228258834977169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6682, "loss": 3.9072935581207275, "lr": 0.0002, "elapsed_sec": 55424.334607839584, "step_time_sec": 8.226346954004839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6683, "loss": 3.9067959785461426, "lr": 0.0002, "elapsed_sec": 55432.56412625313, "step_time_sec": 8.229357542004436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6684, "loss": 3.998234987258911, "lr": 0.0002, "elapsed_sec": 55440.79100227356, "step_time_sec": 8.226722032006364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6685, "loss": 3.8864665031433105, "lr": 0.0002, "elapsed_sec": 55449.01874113083, "step_time_sec": 8.227625474013621, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6686, "loss": 3.937774896621704, "lr": 0.0002, "elapsed_sec": 55457.246810913086, "step_time_sec": 8.227856360026635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6687, "loss": 3.8305552005767822, "lr": 0.0002, "elapsed_sec": 55465.47504091263, "step_time_sec": 8.22807737099356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6688, "loss": 4.019756317138672, "lr": 0.0002, "elapsed_sec": 55473.703882694244, "step_time_sec": 8.228659246989992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6689, "loss": 3.9414970874786377, "lr": 0.0002, "elapsed_sec": 55481.93298316002, "step_time_sec": 8.228956998995272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6690, "loss": 4.0002336502075195, "lr": 0.0002, "elapsed_sec": 55490.162695884705, "step_time_sec": 8.229663478006842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6691, "loss": 3.9385149478912354, "lr": 0.0002, "elapsed_sec": 55498.392402648926, "step_time_sec": 8.229509892989881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6692, "loss": 4.071692943572998, "lr": 0.0002, "elapsed_sec": 55506.62048792839, "step_time_sec": 8.22790150399669, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6693, "loss": 4.060425281524658, "lr": 0.0002, "elapsed_sec": 55514.850190639496, "step_time_sec": 8.229523715010146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6694, "loss": 3.8607990741729736, "lr": 0.0002, "elapsed_sec": 55523.07953763008, "step_time_sec": 8.229159340000479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6695, "loss": 3.8682563304901123, "lr": 0.0002, "elapsed_sec": 55531.30993652344, "step_time_sec": 8.230238126008771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6696, "loss": 3.8741493225097656, "lr": 0.0002, "elapsed_sec": 55539.53902244568, "step_time_sec": 8.22892150998814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6697, "loss": 3.936208963394165, "lr": 0.0002, "elapsed_sec": 55547.76822042465, "step_time_sec": 8.229039560013916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6698, "loss": 3.932182788848877, "lr": 0.0002, "elapsed_sec": 55555.99948120117, "step_time_sec": 8.231112044013571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6699, "loss": 4.019916534423828, "lr": 0.0002, "elapsed_sec": 55564.230469465256, "step_time_sec": 8.230885085999034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6700, "loss": 4.0324296951293945, "lr": 0.0002, "elapsed_sec": 55572.46131873131, "step_time_sec": 8.230647991003934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6701, "loss": 3.9353842735290527, "lr": 0.0002, "elapsed_sec": 55580.69231462479, "step_time_sec": 8.230826049984898, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6702, "loss": 4.069361686706543, "lr": 0.0002, "elapsed_sec": 55588.922726392746, "step_time_sec": 8.23031605000142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6703, "loss": 4.049458980560303, "lr": 0.0002, "elapsed_sec": 55597.15400099754, "step_time_sec": 8.2311372430122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6704, "loss": 3.981907844543457, "lr": 0.0002, "elapsed_sec": 55605.38278675079, "step_time_sec": 8.228576519992203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6705, "loss": 4.100305080413818, "lr": 0.0002, "elapsed_sec": 55613.61227226257, "step_time_sec": 8.229280594008742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6706, "loss": 4.062626838684082, "lr": 0.0002, "elapsed_sec": 55621.84047937393, "step_time_sec": 8.228046520001953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6707, "loss": 4.061092853546143, "lr": 0.0002, "elapsed_sec": 55630.069716215134, "step_time_sec": 8.22915254702093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6708, "loss": 3.9645886421203613, "lr": 0.0002, "elapsed_sec": 55638.29755163193, "step_time_sec": 8.227608905988745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6709, "loss": 4.188321113586426, "lr": 0.0002, "elapsed_sec": 55646.52740430832, "step_time_sec": 8.229685808997601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6710, "loss": 4.0904765129089355, "lr": 0.0002, "elapsed_sec": 55654.75775337219, "step_time_sec": 8.2302404550137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6711, "loss": 3.9729254245758057, "lr": 0.0002, "elapsed_sec": 55662.98778605461, "step_time_sec": 8.229859507002402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6712, "loss": 3.98009991645813, "lr": 0.0002, "elapsed_sec": 55671.219474077225, "step_time_sec": 8.231470395985525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6713, "loss": 3.98095703125, "lr": 0.0002, "elapsed_sec": 55679.44704246521, "step_time_sec": 8.227442421979504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6714, "loss": 4.022159576416016, "lr": 0.0002, "elapsed_sec": 55687.674041986465, "step_time_sec": 8.226780012017116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6715, "loss": 4.020989418029785, "lr": 0.0002, "elapsed_sec": 55695.90265870094, "step_time_sec": 8.2284410049906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6716, "loss": 3.889666795730591, "lr": 0.0002, "elapsed_sec": 55704.13399839401, "step_time_sec": 8.231240769993747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6717, "loss": 3.8926823139190674, "lr": 0.0002, "elapsed_sec": 55712.36412358284, "step_time_sec": 8.229949680011487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6718, "loss": 3.9504587650299072, "lr": 0.0002, "elapsed_sec": 55720.59488749504, "step_time_sec": 8.230556338996394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6719, "loss": 3.856999635696411, "lr": 0.0002, "elapsed_sec": 55728.82498574257, "step_time_sec": 8.229929530993104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6720, "loss": 3.963273048400879, "lr": 0.0002, "elapsed_sec": 55737.05490088463, "step_time_sec": 8.22974180901656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6721, "loss": 3.9080164432525635, "lr": 0.0002, "elapsed_sec": 55745.28394651413, "step_time_sec": 8.22889666899573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6722, "loss": 3.9723918437957764, "lr": 0.0002, "elapsed_sec": 55753.51156234741, "step_time_sec": 8.227442005998455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6723, "loss": 4.080097198486328, "lr": 0.0002, "elapsed_sec": 55761.74057793617, "step_time_sec": 8.228852030995768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6724, "loss": 4.001108169555664, "lr": 0.0002, "elapsed_sec": 55769.969469070435, "step_time_sec": 8.22877288100426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6725, "loss": 3.8922219276428223, "lr": 0.0002, "elapsed_sec": 55778.199038267136, "step_time_sec": 8.229417197988369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6726, "loss": 3.9396872520446777, "lr": 0.0002, "elapsed_sec": 55786.4266974926, "step_time_sec": 8.22743941002409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6727, "loss": 3.8409981727600098, "lr": 0.0002, "elapsed_sec": 55794.65585422516, "step_time_sec": 8.228993783995975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6728, "loss": 3.948594331741333, "lr": 0.0002, "elapsed_sec": 55802.886390924454, "step_time_sec": 8.230385255999863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6729, "loss": 3.9229326248168945, "lr": 0.0002, "elapsed_sec": 55811.11714196205, "step_time_sec": 8.230585159006296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6730, "loss": 3.9731132984161377, "lr": 0.0002, "elapsed_sec": 55819.34848737717, "step_time_sec": 8.231228794000344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6731, "loss": 3.986945152282715, "lr": 0.0002, "elapsed_sec": 55827.576466560364, "step_time_sec": 8.227764659008244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6732, "loss": 4.048515319824219, "lr": 0.0002, "elapsed_sec": 55835.80529713631, "step_time_sec": 8.228660811990267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6733, "loss": 3.986564874649048, "lr": 0.0002, "elapsed_sec": 55844.03461456299, "step_time_sec": 8.229185756004881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6734, "loss": 3.891709566116333, "lr": 0.0002, "elapsed_sec": 55852.26482272148, "step_time_sec": 8.230048206023639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6735, "loss": 4.001583099365234, "lr": 0.0002, "elapsed_sec": 55860.49551534653, "step_time_sec": 8.230537117022322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6736, "loss": 3.993048667907715, "lr": 0.0002, "elapsed_sec": 55868.724277973175, "step_time_sec": 8.228570273990044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6737, "loss": 4.016592025756836, "lr": 0.0002, "elapsed_sec": 55876.95379400253, "step_time_sec": 8.229381482000463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6738, "loss": 4.009341716766357, "lr": 0.0002, "elapsed_sec": 55885.18172454834, "step_time_sec": 8.22779228098807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6739, "loss": 4.018731594085693, "lr": 0.0002, "elapsed_sec": 55893.4121632576, "step_time_sec": 8.23022781900363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6740, "loss": 4.118954658508301, "lr": 0.0002, "elapsed_sec": 55901.642622709274, "step_time_sec": 8.23029992901138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6741, "loss": 3.9924368858337402, "lr": 0.0002, "elapsed_sec": 55909.87128639221, "step_time_sec": 8.228490612003952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6742, "loss": 3.718067169189453, "lr": 0.0002, "elapsed_sec": 55918.09928441048, "step_time_sec": 8.227910895016976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6743, "loss": 4.097124099731445, "lr": 0.0002, "elapsed_sec": 55926.327493190765, "step_time_sec": 8.227985250996426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6744, "loss": 4.081872463226318, "lr": 0.0002, "elapsed_sec": 55934.55765199661, "step_time_sec": 8.230045624019112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6745, "loss": 3.8735764026641846, "lr": 0.0002, "elapsed_sec": 55942.78852057457, "step_time_sec": 8.230679975997191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6746, "loss": 4.005336761474609, "lr": 0.0002, "elapsed_sec": 55951.018584012985, "step_time_sec": 8.229863761982415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6747, "loss": 4.0705647468566895, "lr": 0.0002, "elapsed_sec": 55959.24721288681, "step_time_sec": 8.228480563993799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6748, "loss": 4.025579929351807, "lr": 0.0002, "elapsed_sec": 55967.47563934326, "step_time_sec": 8.228248061990598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6749, "loss": 4.0016398429870605, "lr": 0.0002, "elapsed_sec": 55975.7060174942, "step_time_sec": 8.230213926988654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6750, "loss": 4.01898193359375, "lr": 0.0002, "elapsed_sec": 55983.93678069115, "step_time_sec": 8.23060198200983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6751, "loss": 4.142134666442871, "lr": 0.0002, "elapsed_sec": 55992.16588926315, "step_time_sec": 8.228948903008131, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6752, "loss": 3.9020111560821533, "lr": 0.0002, "elapsed_sec": 56000.39535832405, "step_time_sec": 8.229361589998007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6753, "loss": 3.9376039505004883, "lr": 0.0002, "elapsed_sec": 56008.6230430603, "step_time_sec": 8.227517684019404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6754, "loss": 4.153228282928467, "lr": 0.0002, "elapsed_sec": 56016.853013277054, "step_time_sec": 8.229768303019227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6755, "loss": 4.071016788482666, "lr": 0.0002, "elapsed_sec": 56025.083156347275, "step_time_sec": 8.230037067987723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6756, "loss": 3.9246392250061035, "lr": 0.0002, "elapsed_sec": 56033.31381082535, "step_time_sec": 8.230431942996802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6757, "loss": 3.8513216972351074, "lr": 0.0002, "elapsed_sec": 56041.54387784004, "step_time_sec": 8.229911197995534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6758, "loss": 3.9229981899261475, "lr": 0.0002, "elapsed_sec": 56049.77446103096, "step_time_sec": 8.230457007011864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6759, "loss": 4.06650972366333, "lr": 0.0002, "elapsed_sec": 56058.004596948624, "step_time_sec": 8.22992166801123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6760, "loss": 4.057176113128662, "lr": 0.0002, "elapsed_sec": 56066.234198093414, "step_time_sec": 8.229453155014198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6761, "loss": 4.091211318969727, "lr": 0.0002, "elapsed_sec": 56074.464720487595, "step_time_sec": 8.230413068988128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6762, "loss": 4.061831951141357, "lr": 0.0002, "elapsed_sec": 56082.69588160515, "step_time_sec": 8.230974085017806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6763, "loss": 4.038200378417969, "lr": 0.0002, "elapsed_sec": 56090.9270029068, "step_time_sec": 8.230939619999845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6764, "loss": 4.015891075134277, "lr": 0.0002, "elapsed_sec": 56099.157567977905, "step_time_sec": 8.230409448995488, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6765, "loss": 4.080648899078369, "lr": 0.0002, "elapsed_sec": 56107.38849520683, "step_time_sec": 8.230762179999147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6766, "loss": 4.048790454864502, "lr": 0.0002, "elapsed_sec": 56115.618522167206, "step_time_sec": 8.229940902994713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6767, "loss": 3.828212022781372, "lr": 0.0002, "elapsed_sec": 56123.849715948105, "step_time_sec": 8.230998318991624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6768, "loss": 4.120938777923584, "lr": 0.0002, "elapsed_sec": 56132.08055925369, "step_time_sec": 8.230623323994223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6769, "loss": 3.8492329120635986, "lr": 0.0002, "elapsed_sec": 56140.31068396568, "step_time_sec": 8.230055499996524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6770, "loss": 3.8085415363311768, "lr": 0.0002, "elapsed_sec": 56148.541493177414, "step_time_sec": 8.230591682018712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6771, "loss": 4.054371356964111, "lr": 0.0002, "elapsed_sec": 56156.771976709366, "step_time_sec": 8.230339245987125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6772, "loss": 4.038695812225342, "lr": 0.0002, "elapsed_sec": 56165.00230431557, "step_time_sec": 8.230217211996205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6773, "loss": 3.9768879413604736, "lr": 0.0002, "elapsed_sec": 56173.23201799393, "step_time_sec": 8.229475087020546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6774, "loss": 3.9015872478485107, "lr": 0.0002, "elapsed_sec": 56181.461399793625, "step_time_sec": 8.229205022013048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6775, "loss": 3.948389768600464, "lr": 0.0002, "elapsed_sec": 56189.68968629837, "step_time_sec": 8.228180245001568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6776, "loss": 3.92635178565979, "lr": 0.0002, "elapsed_sec": 56197.91806769371, "step_time_sec": 8.228186376014492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6777, "loss": 3.957775354385376, "lr": 0.0002, "elapsed_sec": 56206.14871931076, "step_time_sec": 8.230483274004655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6778, "loss": 4.00319766998291, "lr": 0.0002, "elapsed_sec": 56214.379657268524, "step_time_sec": 8.230771876988001, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6779, "loss": 3.9378607273101807, "lr": 0.0002, "elapsed_sec": 56222.60918498039, "step_time_sec": 8.229401254007826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6780, "loss": 3.9452974796295166, "lr": 0.0002, "elapsed_sec": 56230.838272571564, "step_time_sec": 8.228920222987654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6781, "loss": 3.933351993560791, "lr": 0.0002, "elapsed_sec": 56239.06921362877, "step_time_sec": 8.230826753017027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6782, "loss": 3.922969102859497, "lr": 0.0002, "elapsed_sec": 56247.29908847809, "step_time_sec": 8.229661623016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6783, "loss": 3.881481409072876, "lr": 0.0002, "elapsed_sec": 56255.52847003937, "step_time_sec": 8.229257074999623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6784, "loss": 3.9608139991760254, "lr": 0.0002, "elapsed_sec": 56263.75623035431, "step_time_sec": 8.2276637640025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6785, "loss": 4.004958629608154, "lr": 0.0002, "elapsed_sec": 56271.98625421524, "step_time_sec": 8.229860837978777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6786, "loss": 3.999690055847168, "lr": 0.0002, "elapsed_sec": 56280.21237421036, "step_time_sec": 8.225943619007012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6787, "loss": 4.041109085083008, "lr": 0.0002, "elapsed_sec": 56288.440514326096, "step_time_sec": 8.227931376983179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6788, "loss": 3.9340014457702637, "lr": 0.0002, "elapsed_sec": 56296.66939878464, "step_time_sec": 8.228786688006949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6789, "loss": 3.838923931121826, "lr": 0.0002, "elapsed_sec": 56304.90069937706, "step_time_sec": 8.231136956979753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6790, "loss": 4.014912128448486, "lr": 0.0002, "elapsed_sec": 56313.13170194626, "step_time_sec": 8.230811953981174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6791, "loss": 4.007974147796631, "lr": 0.0002, "elapsed_sec": 56321.36302947998, "step_time_sec": 8.231166255980497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6792, "loss": 4.063453674316406, "lr": 0.0002, "elapsed_sec": 56329.59419989586, "step_time_sec": 8.231048700981773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6793, "loss": 3.859520435333252, "lr": 0.0002, "elapsed_sec": 56337.824394226074, "step_time_sec": 8.229999602015596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6794, "loss": 3.99415922164917, "lr": 0.0002, "elapsed_sec": 56346.056023836136, "step_time_sec": 8.231464112002868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6795, "loss": 3.9381892681121826, "lr": 0.0002, "elapsed_sec": 56354.28647542, "step_time_sec": 8.230278178991284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6796, "loss": 4.02406644821167, "lr": 0.0002, "elapsed_sec": 56362.517691373825, "step_time_sec": 8.231143684999552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6797, "loss": 3.970428228378296, "lr": 0.0002, "elapsed_sec": 56370.74737882614, "step_time_sec": 8.229463889001636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6798, "loss": 3.880463123321533, "lr": 0.0002, "elapsed_sec": 56378.97814702988, "step_time_sec": 8.230641597998329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6799, "loss": 4.008396148681641, "lr": 0.0002, "elapsed_sec": 56387.2090678215, "step_time_sec": 8.23074040201027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6800, "loss": 4.010987758636475, "lr": 0.0002, "elapsed_sec": 56395.43839097023, "step_time_sec": 8.229188529017847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6801, "loss": 4.041487216949463, "lr": 0.0002, "elapsed_sec": 56403.66737031937, "step_time_sec": 8.228781504993094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6802, "loss": 4.044915199279785, "lr": 0.0002, "elapsed_sec": 56411.89737081528, "step_time_sec": 8.229848404007498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6803, "loss": 3.911872625350952, "lr": 0.0002, "elapsed_sec": 56420.128224372864, "step_time_sec": 8.230769727000734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6804, "loss": 4.016534328460693, "lr": 0.0002, "elapsed_sec": 56428.35880756378, "step_time_sec": 8.230361811001785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6805, "loss": 4.081536293029785, "lr": 0.0002, "elapsed_sec": 56436.58926153183, "step_time_sec": 8.230299675022252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6806, "loss": 3.947039842605591, "lr": 0.0002, "elapsed_sec": 56444.8204703331, "step_time_sec": 8.23108227099874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6807, "loss": 3.9683961868286133, "lr": 0.0002, "elapsed_sec": 56453.05111384392, "step_time_sec": 8.230494456016459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6808, "loss": 3.834191083908081, "lr": 0.0002, "elapsed_sec": 56461.28194975853, "step_time_sec": 8.230656834988622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6809, "loss": 4.016458034515381, "lr": 0.0002, "elapsed_sec": 56469.51235818863, "step_time_sec": 8.230235010007164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6810, "loss": 4.0356316566467285, "lr": 0.0002, "elapsed_sec": 56477.74187493324, "step_time_sec": 8.229363082995405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6811, "loss": 3.9280550479888916, "lr": 0.0002, "elapsed_sec": 56485.97189092636, "step_time_sec": 8.229859621991636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6812, "loss": 3.9215641021728516, "lr": 0.0002, "elapsed_sec": 56494.19900083542, "step_time_sec": 8.226983753003879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6813, "loss": 4.026139736175537, "lr": 0.0002, "elapsed_sec": 56502.429742097855, "step_time_sec": 8.230584346980322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6814, "loss": 4.041325569152832, "lr": 0.0002, "elapsed_sec": 56510.66082572937, "step_time_sec": 8.230956617975608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6815, "loss": 4.140219211578369, "lr": 0.0002, "elapsed_sec": 56518.89207029343, "step_time_sec": 8.23104926297674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6816, "loss": 3.9841842651367188, "lr": 0.0002, "elapsed_sec": 56527.12326455116, "step_time_sec": 8.23106104900944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6817, "loss": 4.027373313903809, "lr": 0.0002, "elapsed_sec": 56535.35438632965, "step_time_sec": 8.231022460007807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6818, "loss": 3.9593706130981445, "lr": 0.0002, "elapsed_sec": 56543.58512187004, "step_time_sec": 8.23048441199353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6819, "loss": 4.013697624206543, "lr": 0.0002, "elapsed_sec": 56551.81612467766, "step_time_sec": 8.230861149990233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6820, "loss": 4.135967254638672, "lr": 0.0002, "elapsed_sec": 56560.046828746796, "step_time_sec": 8.230613377992995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6821, "loss": 3.8795664310455322, "lr": 0.0002, "elapsed_sec": 56568.27762413025, "step_time_sec": 8.230630967998877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6822, "loss": 3.9814321994781494, "lr": 0.0002, "elapsed_sec": 56576.5087018013, "step_time_sec": 8.230897439992987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6823, "loss": 4.002744197845459, "lr": 0.0002, "elapsed_sec": 56584.73831987381, "step_time_sec": 8.229433925997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6824, "loss": 4.0611891746521, "lr": 0.0002, "elapsed_sec": 56592.96846628189, "step_time_sec": 8.230026671022642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6825, "loss": 3.992929458618164, "lr": 0.0002, "elapsed_sec": 56601.199387550354, "step_time_sec": 8.230765336978948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6826, "loss": 4.0243306159973145, "lr": 0.0002, "elapsed_sec": 56609.42910504341, "step_time_sec": 8.22955060901586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6827, "loss": 3.9736528396606445, "lr": 0.0002, "elapsed_sec": 56617.660108327866, "step_time_sec": 8.230836736998754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6828, "loss": 3.9413979053497314, "lr": 0.0002, "elapsed_sec": 56625.889810323715, "step_time_sec": 8.229570611991221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6829, "loss": 4.005590915679932, "lr": 0.0002, "elapsed_sec": 56634.12083888054, "step_time_sec": 8.23084952600766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6830, "loss": 3.914750099182129, "lr": 0.0002, "elapsed_sec": 56642.351687669754, "step_time_sec": 8.23070449999068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6831, "loss": 4.068127155303955, "lr": 0.0002, "elapsed_sec": 56650.582260370255, "step_time_sec": 8.230477507022442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6832, "loss": 4.024408340454102, "lr": 0.0002, "elapsed_sec": 56658.81304335594, "step_time_sec": 8.23055336001562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6833, "loss": 4.004371166229248, "lr": 0.0002, "elapsed_sec": 56667.04258465767, "step_time_sec": 8.229389451007592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6834, "loss": 4.153554439544678, "lr": 0.0002, "elapsed_sec": 56675.27279090881, "step_time_sec": 8.23010457601049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6835, "loss": 4.06609582901001, "lr": 0.0002, "elapsed_sec": 56683.50380516052, "step_time_sec": 8.23080076900078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6836, "loss": 4.101661205291748, "lr": 0.0002, "elapsed_sec": 56691.734558820724, "step_time_sec": 8.23060208800598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6837, "loss": 4.068800926208496, "lr": 0.0002, "elapsed_sec": 56699.96488404274, "step_time_sec": 8.230229255015729, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6838, "loss": 3.9566357135772705, "lr": 0.0002, "elapsed_sec": 56708.19523024559, "step_time_sec": 8.230215558985947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6839, "loss": 3.981255054473877, "lr": 0.0002, "elapsed_sec": 56716.4253885746, "step_time_sec": 8.229947993997484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6840, "loss": 3.896245002746582, "lr": 0.0002, "elapsed_sec": 56724.653673172, "step_time_sec": 8.228119981999043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6841, "loss": 3.897057294845581, "lr": 0.0002, "elapsed_sec": 56732.88153767586, "step_time_sec": 8.227700489980634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6842, "loss": 3.9792442321777344, "lr": 0.0002, "elapsed_sec": 56741.10963654518, "step_time_sec": 8.227997577021597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6843, "loss": 3.805574893951416, "lr": 0.0002, "elapsed_sec": 56749.33869218826, "step_time_sec": 8.228852819011081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6844, "loss": 4.052221775054932, "lr": 0.0002, "elapsed_sec": 56757.56941127777, "step_time_sec": 8.230588027014164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6845, "loss": 4.097324848175049, "lr": 0.0002, "elapsed_sec": 56765.79996299744, "step_time_sec": 8.23041731599369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6846, "loss": 3.9560208320617676, "lr": 0.0002, "elapsed_sec": 56774.093069553375, "step_time_sec": 8.236324066005182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6847, "loss": 4.13460636138916, "lr": 0.0002, "elapsed_sec": 56782.32134747505, "step_time_sec": 8.228092226025183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6848, "loss": 4.069876194000244, "lr": 0.0002, "elapsed_sec": 56790.550704956055, "step_time_sec": 8.229244715999812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6849, "loss": 4.103945732116699, "lr": 0.0002, "elapsed_sec": 56798.78057169914, "step_time_sec": 8.229638435004745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6850, "loss": 4.1548686027526855, "lr": 0.0002, "elapsed_sec": 56807.01088356972, "step_time_sec": 8.230175142001826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6851, "loss": 3.9675276279449463, "lr": 0.0002, "elapsed_sec": 56815.24164509773, "step_time_sec": 8.23059450299479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6852, "loss": 4.164885520935059, "lr": 0.0002, "elapsed_sec": 56823.472286462784, "step_time_sec": 8.230499097990105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6853, "loss": 3.956989049911499, "lr": 0.0002, "elapsed_sec": 56831.703093767166, "step_time_sec": 8.230630600010045, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6854, "loss": 4.226017475128174, "lr": 0.0002, "elapsed_sec": 56839.933000802994, "step_time_sec": 8.229777359985746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6855, "loss": 4.04807710647583, "lr": 0.0002, "elapsed_sec": 56848.160158872604, "step_time_sec": 8.22694927698467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6856, "loss": 4.0915608406066895, "lr": 0.0002, "elapsed_sec": 56856.389484643936, "step_time_sec": 8.229229552001925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6857, "loss": 4.040955066680908, "lr": 0.0002, "elapsed_sec": 56864.61724805832, "step_time_sec": 8.227628426015144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6858, "loss": 3.9868712425231934, "lr": 0.0002, "elapsed_sec": 56872.84600138664, "step_time_sec": 8.228518111020094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6859, "loss": 4.098319053649902, "lr": 0.0002, "elapsed_sec": 56881.07469820976, "step_time_sec": 8.228593473002547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6860, "loss": 4.030511856079102, "lr": 0.0002, "elapsed_sec": 56889.30423307419, "step_time_sec": 8.22932418101118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6861, "loss": 4.179208755493164, "lr": 0.0002, "elapsed_sec": 56897.534239053726, "step_time_sec": 8.22989588099881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6862, "loss": 4.117051124572754, "lr": 0.0002, "elapsed_sec": 56905.76303315163, "step_time_sec": 8.228659804008203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6863, "loss": 4.171983242034912, "lr": 0.0002, "elapsed_sec": 56913.99149060249, "step_time_sec": 8.22830445697764, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6864, "loss": 4.240285873413086, "lr": 0.0002, "elapsed_sec": 56922.22106337547, "step_time_sec": 8.229329218011117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6865, "loss": 3.997610092163086, "lr": 0.0002, "elapsed_sec": 56930.45165395737, "step_time_sec": 8.230450786009897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6866, "loss": 3.9791207313537598, "lr": 0.0002, "elapsed_sec": 56938.67958664894, "step_time_sec": 8.227760781010147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6867, "loss": 3.972749948501587, "lr": 0.0002, "elapsed_sec": 56946.907797575, "step_time_sec": 8.228024143987568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6868, "loss": 4.0040082931518555, "lr": 0.0002, "elapsed_sec": 56955.13681721687, "step_time_sec": 8.228987134003546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6869, "loss": 4.102136135101318, "lr": 0.0002, "elapsed_sec": 56963.36679267883, "step_time_sec": 8.229709288978484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6870, "loss": 3.9129488468170166, "lr": 0.0002, "elapsed_sec": 56971.596794843674, "step_time_sec": 8.229892467992613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6871, "loss": 4.038017749786377, "lr": 0.0002, "elapsed_sec": 56979.82747721672, "step_time_sec": 8.230494849005481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6872, "loss": 4.288708686828613, "lr": 0.0002, "elapsed_sec": 56988.057727098465, "step_time_sec": 8.230123896995792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6873, "loss": 4.098853588104248, "lr": 0.0002, "elapsed_sec": 56996.28967642784, "step_time_sec": 8.231769326986978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6874, "loss": 3.9342002868652344, "lr": 0.0002, "elapsed_sec": 57004.51932477951, "step_time_sec": 8.229484521987615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6875, "loss": 4.200708866119385, "lr": 0.0002, "elapsed_sec": 57012.750490665436, "step_time_sec": 8.231004186993232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6876, "loss": 4.147547245025635, "lr": 0.0002, "elapsed_sec": 57020.97928023338, "step_time_sec": 8.22869105899008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6877, "loss": 3.9841437339782715, "lr": 0.0002, "elapsed_sec": 57029.20804023743, "step_time_sec": 8.228531075001229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6878, "loss": 3.8881256580352783, "lr": 0.0002, "elapsed_sec": 57037.43718671799, "step_time_sec": 8.228996335004922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6879, "loss": 4.130175590515137, "lr": 0.0002, "elapsed_sec": 57045.666870594025, "step_time_sec": 8.229581057996256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6880, "loss": 3.997246742248535, "lr": 0.0002, "elapsed_sec": 57053.89659023285, "step_time_sec": 8.229565713001648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6881, "loss": 4.125446796417236, "lr": 0.0002, "elapsed_sec": 57062.12356567383, "step_time_sec": 8.226749090012163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6882, "loss": 4.154426574707031, "lr": 0.0002, "elapsed_sec": 57070.351779937744, "step_time_sec": 8.22806805500295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6883, "loss": 4.1613359451293945, "lr": 0.0002, "elapsed_sec": 57078.58202910423, "step_time_sec": 8.230088453012286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6884, "loss": 4.054304122924805, "lr": 0.0002, "elapsed_sec": 57086.81248283386, "step_time_sec": 8.230301170988241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6885, "loss": 3.9161102771759033, "lr": 0.0002, "elapsed_sec": 57095.042889118195, "step_time_sec": 8.230301670002518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6886, "loss": 3.8607499599456787, "lr": 0.0002, "elapsed_sec": 57103.273476839066, "step_time_sec": 8.230392427009065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6887, "loss": 4.287466526031494, "lr": 0.0002, "elapsed_sec": 57111.50448489189, "step_time_sec": 8.230832125002053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6888, "loss": 4.152257442474365, "lr": 0.0002, "elapsed_sec": 57119.73529577255, "step_time_sec": 8.230667230003746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6889, "loss": 4.0188117027282715, "lr": 0.0002, "elapsed_sec": 57127.964549064636, "step_time_sec": 8.229092497000238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6890, "loss": 4.086432933807373, "lr": 0.0002, "elapsed_sec": 57136.194994449615, "step_time_sec": 8.230341210990446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6891, "loss": 3.9071075916290283, "lr": 0.0002, "elapsed_sec": 57144.42456269264, "step_time_sec": 8.229429292026907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6892, "loss": 4.083230972290039, "lr": 0.0002, "elapsed_sec": 57152.6548371315, "step_time_sec": 8.23008830100298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6893, "loss": 4.140836715698242, "lr": 0.0002, "elapsed_sec": 57160.88608789444, "step_time_sec": 8.23112549202051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6894, "loss": 4.141688823699951, "lr": 0.0002, "elapsed_sec": 57169.1159427166, "step_time_sec": 8.229622556013055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6895, "loss": 4.079252243041992, "lr": 0.0002, "elapsed_sec": 57177.34365487099, "step_time_sec": 8.227549274015473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6896, "loss": 4.170923709869385, "lr": 0.0002, "elapsed_sec": 57185.57258749008, "step_time_sec": 8.228761698992457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6897, "loss": 4.104813098907471, "lr": 0.0002, "elapsed_sec": 57193.80258488655, "step_time_sec": 8.229916583979502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6898, "loss": 4.199407577514648, "lr": 0.0002, "elapsed_sec": 57202.03251671791, "step_time_sec": 8.229739082977176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6899, "loss": 4.201441764831543, "lr": 0.0002, "elapsed_sec": 57210.26219701767, "step_time_sec": 8.229495058010798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6900, "loss": 4.150923728942871, "lr": 0.0002, "elapsed_sec": 57218.4928355217, "step_time_sec": 8.230457973986631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6901, "loss": 4.192602157592773, "lr": 0.0002, "elapsed_sec": 57226.72049474716, "step_time_sec": 8.22755874300492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6902, "loss": 4.135822772979736, "lr": 0.0002, "elapsed_sec": 57234.95211791992, "step_time_sec": 8.231442756019533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6903, "loss": 4.2943902015686035, "lr": 0.0002, "elapsed_sec": 57243.18262910843, "step_time_sec": 8.230368635995546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6904, "loss": 4.401382923126221, "lr": 0.0002, "elapsed_sec": 57251.41009759903, "step_time_sec": 8.227313062001485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6905, "loss": 4.155056953430176, "lr": 0.0002, "elapsed_sec": 57259.63911700249, "step_time_sec": 8.228844213008415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6906, "loss": 4.1859354972839355, "lr": 0.0002, "elapsed_sec": 57267.86844301224, "step_time_sec": 8.229192841012264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6907, "loss": 3.9766411781311035, "lr": 0.0002, "elapsed_sec": 57276.099618673325, "step_time_sec": 8.231065751984715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6908, "loss": 4.103596210479736, "lr": 0.0002, "elapsed_sec": 57284.330290555954, "step_time_sec": 8.230451629991876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6909, "loss": 4.0201334953308105, "lr": 0.0002, "elapsed_sec": 57292.56099629402, "step_time_sec": 8.230573442997411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6910, "loss": 4.116074085235596, "lr": 0.0002, "elapsed_sec": 57300.79157114029, "step_time_sec": 8.230404703994282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6911, "loss": 4.247630596160889, "lr": 0.0002, "elapsed_sec": 57309.022028923035, "step_time_sec": 8.230308442987734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6912, "loss": 4.0352582931518555, "lr": 0.0002, "elapsed_sec": 57317.250824928284, "step_time_sec": 8.228628861979814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6913, "loss": 3.9679274559020996, "lr": 0.0002, "elapsed_sec": 57325.47938537598, "step_time_sec": 8.228411824005889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6914, "loss": 4.164205551147461, "lr": 0.0002, "elapsed_sec": 57333.707146167755, "step_time_sec": 8.227612216986017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6915, "loss": 4.279995441436768, "lr": 0.0002, "elapsed_sec": 57341.93635940552, "step_time_sec": 8.229064598010154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6916, "loss": 4.0874433517456055, "lr": 0.0002, "elapsed_sec": 57350.163276433945, "step_time_sec": 8.226860780006973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6917, "loss": 4.106481552124023, "lr": 0.0002, "elapsed_sec": 57358.3947019577, "step_time_sec": 8.231246341019869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6918, "loss": 4.169209957122803, "lr": 0.0002, "elapsed_sec": 57366.62535762787, "step_time_sec": 8.230416214995785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6919, "loss": 4.200072288513184, "lr": 0.0002, "elapsed_sec": 57374.855453014374, "step_time_sec": 8.229946612002095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6920, "loss": 4.124337196350098, "lr": 0.0002, "elapsed_sec": 57383.085565805435, "step_time_sec": 8.229980101983529, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6921, "loss": 4.145857810974121, "lr": 0.0002, "elapsed_sec": 57391.316363334656, "step_time_sec": 8.230704076006077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6922, "loss": 4.245279788970947, "lr": 0.0002, "elapsed_sec": 57399.54672861099, "step_time_sec": 8.230128409981262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6923, "loss": 4.110450267791748, "lr": 0.0002, "elapsed_sec": 57407.776146411896, "step_time_sec": 8.229300597973634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6924, "loss": 3.9745500087738037, "lr": 0.0002, "elapsed_sec": 57416.00658750534, "step_time_sec": 8.23030773299979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6925, "loss": 4.224374294281006, "lr": 0.0002, "elapsed_sec": 57424.23555231094, "step_time_sec": 8.228779247991042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6926, "loss": 4.100505352020264, "lr": 0.0002, "elapsed_sec": 57432.46426343918, "step_time_sec": 8.22861270798603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6927, "loss": 4.188836097717285, "lr": 0.0002, "elapsed_sec": 57440.69443845749, "step_time_sec": 8.229932449001353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6928, "loss": 4.058791637420654, "lr": 0.0002, "elapsed_sec": 57448.9222574234, "step_time_sec": 8.22766392599442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6929, "loss": 4.136904716491699, "lr": 0.0002, "elapsed_sec": 57457.15297961235, "step_time_sec": 8.230574154003989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6930, "loss": 4.097421646118164, "lr": 0.0002, "elapsed_sec": 57465.38341450691, "step_time_sec": 8.230339276982704, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6931, "loss": 3.884533405303955, "lr": 0.0002, "elapsed_sec": 57473.61452746391, "step_time_sec": 8.23087912099436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6932, "loss": 4.025027275085449, "lr": 0.0002, "elapsed_sec": 57481.845041036606, "step_time_sec": 8.23036057499121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6933, "loss": 4.119470119476318, "lr": 0.0002, "elapsed_sec": 57490.075524806976, "step_time_sec": 8.230365714000072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6934, "loss": 4.104611873626709, "lr": 0.0002, "elapsed_sec": 57498.30610489845, "step_time_sec": 8.230396364000626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6935, "loss": 4.134943008422852, "lr": 0.0002, "elapsed_sec": 57506.53646826744, "step_time_sec": 8.230260924989125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6936, "loss": 4.152744770050049, "lr": 0.0002, "elapsed_sec": 57514.767536878586, "step_time_sec": 8.23084626000491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6937, "loss": 4.037284851074219, "lr": 0.0002, "elapsed_sec": 57522.99594068527, "step_time_sec": 8.228260831005173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6938, "loss": 4.206108570098877, "lr": 0.0002, "elapsed_sec": 57531.22393774986, "step_time_sec": 8.227912898000795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6939, "loss": 4.0170512199401855, "lr": 0.0002, "elapsed_sec": 57539.45073509216, "step_time_sec": 8.226580702001229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6940, "loss": 4.048300266265869, "lr": 0.0002, "elapsed_sec": 57547.68102359772, "step_time_sec": 8.230118865001714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6941, "loss": 4.058335781097412, "lr": 0.0002, "elapsed_sec": 57555.911430597305, "step_time_sec": 8.230234337999718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6942, "loss": 4.103797435760498, "lr": 0.0002, "elapsed_sec": 57564.14143395424, "step_time_sec": 8.229846494010417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6943, "loss": 4.093710422515869, "lr": 0.0002, "elapsed_sec": 57572.37063026428, "step_time_sec": 8.229057988995919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6944, "loss": 4.201979637145996, "lr": 0.0002, "elapsed_sec": 57580.598616838455, "step_time_sec": 8.22783355199499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6945, "loss": 4.3888115882873535, "lr": 0.0002, "elapsed_sec": 57588.82852053642, "step_time_sec": 8.2297424619901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6946, "loss": 4.273011207580566, "lr": 0.0002, "elapsed_sec": 57597.05847263336, "step_time_sec": 8.229799397988245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6947, "loss": 4.0080084800720215, "lr": 0.0002, "elapsed_sec": 57605.28814816475, "step_time_sec": 8.229545676003909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6948, "loss": 4.1089582443237305, "lr": 0.0002, "elapsed_sec": 57613.51816701889, "step_time_sec": 8.229844765999587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6949, "loss": 4.130831718444824, "lr": 0.0002, "elapsed_sec": 57621.74908852577, "step_time_sec": 8.23076094299904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6950, "loss": 4.14695930480957, "lr": 0.0002, "elapsed_sec": 57629.97890710831, "step_time_sec": 8.229680325021036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6951, "loss": 4.186397552490234, "lr": 0.0002, "elapsed_sec": 57638.21001291275, "step_time_sec": 8.23099242700846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6952, "loss": 4.175738334655762, "lr": 0.0002, "elapsed_sec": 57646.44014334679, "step_time_sec": 8.229960856988328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6953, "loss": 4.276651382446289, "lr": 0.0002, "elapsed_sec": 57654.670196056366, "step_time_sec": 8.229825242015067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6954, "loss": 4.098978042602539, "lr": 0.0002, "elapsed_sec": 57662.90008997917, "step_time_sec": 8.229790972021874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6955, "loss": 4.391275405883789, "lr": 0.0002, "elapsed_sec": 57671.12920188904, "step_time_sec": 8.228920652996749, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6956, "loss": 4.372820854187012, "lr": 0.0002, "elapsed_sec": 57679.35749554634, "step_time_sec": 8.228125524998177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6957, "loss": 4.176952838897705, "lr": 0.0002, "elapsed_sec": 57687.5873465538, "step_time_sec": 8.229676753020613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6958, "loss": 4.026494979858398, "lr": 0.0002, "elapsed_sec": 57695.81706237793, "step_time_sec": 8.229548004979733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6959, "loss": 4.162850856781006, "lr": 0.0002, "elapsed_sec": 57704.047627449036, "step_time_sec": 8.230431243981002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6960, "loss": 4.274752140045166, "lr": 0.0002, "elapsed_sec": 57712.27878713608, "step_time_sec": 8.231021921994397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6961, "loss": 4.111442565917969, "lr": 0.0002, "elapsed_sec": 57720.509303331375, "step_time_sec": 8.230339166009799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6962, "loss": 4.130885124206543, "lr": 0.0002, "elapsed_sec": 57728.74034380913, "step_time_sec": 8.23089428700041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6963, "loss": 4.110902786254883, "lr": 0.0002, "elapsed_sec": 57736.97184443474, "step_time_sec": 8.231361711019417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6964, "loss": 3.9700984954833984, "lr": 0.0002, "elapsed_sec": 57745.20220708847, "step_time_sec": 8.230185605993029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6965, "loss": 4.122156620025635, "lr": 0.0002, "elapsed_sec": 57753.432025909424, "step_time_sec": 8.229665368999122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6966, "loss": 4.319647789001465, "lr": 0.0002, "elapsed_sec": 57761.66220641136, "step_time_sec": 8.230080117005855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6967, "loss": 4.353724956512451, "lr": 0.0002, "elapsed_sec": 57769.89437675476, "step_time_sec": 8.231944405997638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6968, "loss": 4.236881732940674, "lr": 0.0002, "elapsed_sec": 57778.12499451637, "step_time_sec": 8.230459216982126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6969, "loss": 4.256467819213867, "lr": 0.0002, "elapsed_sec": 57786.35486793518, "step_time_sec": 8.229785024974262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6970, "loss": 4.111676216125488, "lr": 0.0002, "elapsed_sec": 57794.58365440369, "step_time_sec": 8.228592069994193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6971, "loss": 4.189267158508301, "lr": 0.0002, "elapsed_sec": 57802.8135843277, "step_time_sec": 8.229826213995693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6972, "loss": 4.379019260406494, "lr": 0.0002, "elapsed_sec": 57811.04473614693, "step_time_sec": 8.230929765995825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6973, "loss": 4.321106433868408, "lr": 0.0002, "elapsed_sec": 57819.27513027191, "step_time_sec": 8.230270008003572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6974, "loss": 4.363229274749756, "lr": 0.0002, "elapsed_sec": 57827.505927324295, "step_time_sec": 8.230634170991834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6975, "loss": 4.1371989250183105, "lr": 0.0002, "elapsed_sec": 57835.73650455475, "step_time_sec": 8.230458940000972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6976, "loss": 4.230884552001953, "lr": 0.0002, "elapsed_sec": 57843.967079639435, "step_time_sec": 8.230368759017438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6977, "loss": 4.411556720733643, "lr": 0.0002, "elapsed_sec": 57852.19828248024, "step_time_sec": 8.23103537800489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6978, "loss": 4.210374355316162, "lr": 0.0002, "elapsed_sec": 57860.42674088478, "step_time_sec": 8.228308085002936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6979, "loss": 4.3409647941589355, "lr": 0.0002, "elapsed_sec": 57868.65492725372, "step_time_sec": 8.228048121993197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6980, "loss": 4.269221782684326, "lr": 0.0002, "elapsed_sec": 57876.88342022896, "step_time_sec": 8.228376643994125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6981, "loss": 4.180107593536377, "lr": 0.0002, "elapsed_sec": 57885.11286520958, "step_time_sec": 8.22924882400548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6982, "loss": 4.233018398284912, "lr": 0.0002, "elapsed_sec": 57893.34490418434, "step_time_sec": 8.231928141991375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6983, "loss": 4.258309364318848, "lr": 0.0002, "elapsed_sec": 57901.574800252914, "step_time_sec": 8.229752751009073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6984, "loss": 4.338129997253418, "lr": 0.0002, "elapsed_sec": 57909.80412721634, "step_time_sec": 8.229117019975092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6985, "loss": 4.14637565612793, "lr": 0.0002, "elapsed_sec": 57918.03484749794, "step_time_sec": 8.230549586005509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6986, "loss": 4.277188777923584, "lr": 0.0002, "elapsed_sec": 57926.264516830444, "step_time_sec": 8.229595379991224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6987, "loss": 4.26658296585083, "lr": 0.0002, "elapsed_sec": 57934.4929831028, "step_time_sec": 8.228247154009296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6988, "loss": 4.131990432739258, "lr": 0.0002, "elapsed_sec": 57942.72146463394, "step_time_sec": 8.228357268002583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6989, "loss": 4.359495162963867, "lr": 0.0002, "elapsed_sec": 57950.950145721436, "step_time_sec": 8.228499824996106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6990, "loss": 4.221188068389893, "lr": 0.0002, "elapsed_sec": 57959.17821884155, "step_time_sec": 8.227921592013445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6991, "loss": 4.405880928039551, "lr": 0.0002, "elapsed_sec": 57967.40826678276, "step_time_sec": 8.229904206003994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6992, "loss": 4.273005485534668, "lr": 0.0002, "elapsed_sec": 57975.63928604126, "step_time_sec": 8.230908996978542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6993, "loss": 4.658147811889648, "lr": 0.0002, "elapsed_sec": 57983.86862921715, "step_time_sec": 8.229164335993119, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6994, "loss": 4.327227592468262, "lr": 0.0002, "elapsed_sec": 57992.097762823105, "step_time_sec": 8.22893541501253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6995, "loss": 4.418062210083008, "lr": 0.0002, "elapsed_sec": 58000.32623767853, "step_time_sec": 8.228338532004273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6996, "loss": 4.259218215942383, "lr": 0.0002, "elapsed_sec": 58008.55652117729, "step_time_sec": 8.230131869000616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6997, "loss": 4.091646194458008, "lr": 0.0002, "elapsed_sec": 58016.78670620918, "step_time_sec": 8.23006457000156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6998, "loss": 4.262274742126465, "lr": 0.0002, "elapsed_sec": 58025.01563334465, "step_time_sec": 8.228750999987824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 6999, "loss": 4.240515232086182, "lr": 0.0002, "elapsed_sec": 58033.24628353119, "step_time_sec": 8.230466867011273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7000, "loss": 4.260456085205078, "lr": 0.0002, "elapsed_sec": 58041.475135326385, "step_time_sec": 52.83529147101217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9821570510102902, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7001, "loss": 4.34794807434082, "lr": 0.0002, "elapsed_sec": 58094.324305295944, "step_time_sec": 8.242464379000012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7002, "loss": 4.2375807762146, "lr": 0.0002, "elapsed_sec": 58102.55512428284, "step_time_sec": 8.230655590014067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7003, "loss": 4.163426399230957, "lr": 0.0002, "elapsed_sec": 58110.78632354736, "step_time_sec": 8.231061375001445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7004, "loss": 4.277596950531006, "lr": 0.0002, "elapsed_sec": 58119.01732969284, "step_time_sec": 8.23082368899486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7005, "loss": 4.231134414672852, "lr": 0.0002, "elapsed_sec": 58127.24815273285, "step_time_sec": 8.230687880975893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7006, "loss": 4.233760833740234, "lr": 0.0002, "elapsed_sec": 58135.47966527939, "step_time_sec": 8.231307909009047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7007, "loss": 4.180541038513184, "lr": 0.0002, "elapsed_sec": 58143.709958553314, "step_time_sec": 8.230137795006158, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7008, "loss": 4.306268215179443, "lr": 0.0002, "elapsed_sec": 58151.94117355347, "step_time_sec": 8.231133754976327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7009, "loss": 4.150060653686523, "lr": 0.0002, "elapsed_sec": 58160.17226076126, "step_time_sec": 8.230896787979873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7010, "loss": 4.120386600494385, "lr": 0.0002, "elapsed_sec": 58168.40427279472, "step_time_sec": 8.231806404015515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7011, "loss": 4.216342926025391, "lr": 0.0002, "elapsed_sec": 58176.63510465622, "step_time_sec": 8.230716941005085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7012, "loss": 4.297092914581299, "lr": 0.0002, "elapsed_sec": 58184.86638665199, "step_time_sec": 8.231078618991887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7013, "loss": 4.368104457855225, "lr": 0.0002, "elapsed_sec": 58193.09754443169, "step_time_sec": 8.2310216879996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7014, "loss": 4.211121082305908, "lr": 0.0002, "elapsed_sec": 58201.32805109024, "step_time_sec": 8.230308386002434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7015, "loss": 4.220433712005615, "lr": 0.0002, "elapsed_sec": 58209.5587952137, "step_time_sec": 8.230577298003482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7016, "loss": 4.1872944831848145, "lr": 0.0002, "elapsed_sec": 58217.78896021843, "step_time_sec": 8.229993527987972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7017, "loss": 4.199427127838135, "lr": 0.0002, "elapsed_sec": 58226.01969027519, "step_time_sec": 8.230587266007205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7018, "loss": 4.350005149841309, "lr": 0.0002, "elapsed_sec": 58234.24947476387, "step_time_sec": 8.229681480996078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7019, "loss": 4.081399440765381, "lr": 0.0002, "elapsed_sec": 58242.48013472557, "step_time_sec": 8.230436517013004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7020, "loss": 4.311728000640869, "lr": 0.0002, "elapsed_sec": 58250.711047410965, "step_time_sec": 8.230757753015496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7021, "loss": 4.282634735107422, "lr": 0.0002, "elapsed_sec": 58258.94048547745, "step_time_sec": 8.22927326298668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7022, "loss": 4.23936653137207, "lr": 0.0002, "elapsed_sec": 58267.16973686218, "step_time_sec": 8.229119951021858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7023, "loss": 4.217772483825684, "lr": 0.0002, "elapsed_sec": 58275.39782452583, "step_time_sec": 8.227939541015076, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7024, "loss": 4.093644142150879, "lr": 0.0002, "elapsed_sec": 58283.62787413597, "step_time_sec": 8.229791407007724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7025, "loss": 4.340470790863037, "lr": 0.0002, "elapsed_sec": 58291.8558986187, "step_time_sec": 8.227905117993942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7026, "loss": 4.466811656951904, "lr": 0.0002, "elapsed_sec": 58300.08631324768, "step_time_sec": 8.230257936986163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7027, "loss": 4.23583984375, "lr": 0.0002, "elapsed_sec": 58308.31743836403, "step_time_sec": 8.230976675986312, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7028, "loss": 4.219661712646484, "lr": 0.0002, "elapsed_sec": 58316.5460062027, "step_time_sec": 8.228349244018318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7029, "loss": 4.310572624206543, "lr": 0.0002, "elapsed_sec": 58324.77765083313, "step_time_sec": 8.231493507017149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7030, "loss": 4.055715560913086, "lr": 0.0002, "elapsed_sec": 58333.00877547264, "step_time_sec": 8.230962539993925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7031, "loss": 4.316799640655518, "lr": 0.0002, "elapsed_sec": 58341.239542245865, "step_time_sec": 8.230610065016663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7032, "loss": 4.332141399383545, "lr": 0.0002, "elapsed_sec": 58349.470484018326, "step_time_sec": 8.230776490003336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7033, "loss": 4.18350076675415, "lr": 0.0002, "elapsed_sec": 58357.70134806633, "step_time_sec": 8.230718053004239, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7034, "loss": 4.293123722076416, "lr": 0.0002, "elapsed_sec": 58365.932094335556, "step_time_sec": 8.230588319973322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7035, "loss": 4.266149997711182, "lr": 0.0002, "elapsed_sec": 58374.163177490234, "step_time_sec": 8.230919917026768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7036, "loss": 4.295401096343994, "lr": 0.0002, "elapsed_sec": 58382.392857551575, "step_time_sec": 8.229566838999745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7037, "loss": 4.3060078620910645, "lr": 0.0002, "elapsed_sec": 58390.62171959877, "step_time_sec": 8.228656073013553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7038, "loss": 4.296298027038574, "lr": 0.0002, "elapsed_sec": 58398.85306406021, "step_time_sec": 8.231198786990717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7039, "loss": 4.277548789978027, "lr": 0.0002, "elapsed_sec": 58407.08304858208, "step_time_sec": 8.229882863990497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7040, "loss": 4.3473310470581055, "lr": 0.0002, "elapsed_sec": 58415.31197333336, "step_time_sec": 8.22871597998892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7041, "loss": 4.291843891143799, "lr": 0.0002, "elapsed_sec": 58423.54323172569, "step_time_sec": 8.231102771009319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7042, "loss": 4.094958305358887, "lr": 0.0002, "elapsed_sec": 58431.77499294281, "step_time_sec": 8.231588937982451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7043, "loss": 4.294558048248291, "lr": 0.0002, "elapsed_sec": 58440.0056040287, "step_time_sec": 8.230493435024982, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7044, "loss": 4.366809368133545, "lr": 0.0002, "elapsed_sec": 58448.236193180084, "step_time_sec": 8.230428251990816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7045, "loss": 4.325551986694336, "lr": 0.0002, "elapsed_sec": 58456.46707820892, "step_time_sec": 8.230745009001112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7046, "loss": 4.281888008117676, "lr": 0.0002, "elapsed_sec": 58464.69797921181, "step_time_sec": 8.230743394000456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7047, "loss": 4.217240810394287, "lr": 0.0002, "elapsed_sec": 58472.92810463905, "step_time_sec": 8.229916405980475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7048, "loss": 4.350986957550049, "lr": 0.0002, "elapsed_sec": 58481.159616708755, "step_time_sec": 8.23138032699353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7049, "loss": 4.081053733825684, "lr": 0.0002, "elapsed_sec": 58489.39038062096, "step_time_sec": 8.23061375398538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7050, "loss": 4.273712635040283, "lr": 0.0002, "elapsed_sec": 58497.6212747097, "step_time_sec": 8.230791841022437, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7051, "loss": 4.185316562652588, "lr": 0.0002, "elapsed_sec": 58505.85249662399, "step_time_sec": 8.231001047999598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7052, "loss": 4.153670787811279, "lr": 0.0002, "elapsed_sec": 58514.0829308033, "step_time_sec": 8.230266116006533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7053, "loss": 4.175570487976074, "lr": 0.0002, "elapsed_sec": 58522.3140296936, "step_time_sec": 8.230980036983965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7054, "loss": 4.279809951782227, "lr": 0.0002, "elapsed_sec": 58530.54418849945, "step_time_sec": 8.229964939993806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7055, "loss": 4.357296943664551, "lr": 0.0002, "elapsed_sec": 58538.77535843849, "step_time_sec": 8.23101583000971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7056, "loss": 4.387138366699219, "lr": 0.0002, "elapsed_sec": 58547.00590181351, "step_time_sec": 8.230414182995446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7057, "loss": 4.15646505355835, "lr": 0.0002, "elapsed_sec": 58555.23610711098, "step_time_sec": 8.230013895023149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7058, "loss": 4.374375343322754, "lr": 0.0002, "elapsed_sec": 58563.467005968094, "step_time_sec": 8.230734581011347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7059, "loss": 4.27750825881958, "lr": 0.0002, "elapsed_sec": 58571.6977391243, "step_time_sec": 8.230571546999272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7060, "loss": 4.222591876983643, "lr": 0.0002, "elapsed_sec": 58579.92766833305, "step_time_sec": 8.229823200003011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7061, "loss": 4.254866123199463, "lr": 0.0002, "elapsed_sec": 58588.156682252884, "step_time_sec": 8.228858461981872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7062, "loss": 4.518808364868164, "lr": 0.0002, "elapsed_sec": 58596.38801193237, "step_time_sec": 8.231136958987918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7063, "loss": 4.271884441375732, "lr": 0.0002, "elapsed_sec": 58604.61874938011, "step_time_sec": 8.230564736004453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7064, "loss": 4.1908955574035645, "lr": 0.0002, "elapsed_sec": 58612.849816799164, "step_time_sec": 8.23095875699073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7065, "loss": 4.130068302154541, "lr": 0.0002, "elapsed_sec": 58621.08041143417, "step_time_sec": 8.230369799013715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7066, "loss": 4.237207412719727, "lr": 0.0002, "elapsed_sec": 58629.31138944626, "step_time_sec": 8.230835540976841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7067, "loss": 4.1123456954956055, "lr": 0.0002, "elapsed_sec": 58637.54297876358, "step_time_sec": 8.23147924701334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7068, "loss": 4.206380844116211, "lr": 0.0002, "elapsed_sec": 58645.77402663231, "step_time_sec": 8.230811827001162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7069, "loss": 4.184826374053955, "lr": 0.0002, "elapsed_sec": 58654.00507092476, "step_time_sec": 8.230928693985334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7070, "loss": 4.194674968719482, "lr": 0.0002, "elapsed_sec": 58662.232934474945, "step_time_sec": 8.227711745013949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7071, "loss": 4.44033670425415, "lr": 0.0002, "elapsed_sec": 58670.46185731888, "step_time_sec": 8.228727329988033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7072, "loss": 4.504659175872803, "lr": 0.0002, "elapsed_sec": 58678.691573381424, "step_time_sec": 8.229574705008417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7073, "loss": 4.32603645324707, "lr": 0.0002, "elapsed_sec": 58686.92187809944, "step_time_sec": 8.2301441860036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7074, "loss": 4.302230358123779, "lr": 0.0002, "elapsed_sec": 58695.15256476402, "step_time_sec": 8.230611391016282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7075, "loss": 4.121055603027344, "lr": 0.0002, "elapsed_sec": 58703.38112902641, "step_time_sec": 8.228338040993549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7076, "loss": 4.40626335144043, "lr": 0.0002, "elapsed_sec": 58711.609537124634, "step_time_sec": 8.228268922015559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7077, "loss": 4.275505065917969, "lr": 0.0002, "elapsed_sec": 58719.83800339699, "step_time_sec": 8.228275613015285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7078, "loss": 4.306453704833984, "lr": 0.0002, "elapsed_sec": 58728.06940150261, "step_time_sec": 8.231300718005514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7079, "loss": 4.397117614746094, "lr": 0.0002, "elapsed_sec": 58736.29996395111, "step_time_sec": 8.230338649998885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7080, "loss": 4.160373210906982, "lr": 0.0002, "elapsed_sec": 58744.530170440674, "step_time_sec": 8.230050760001177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7081, "loss": 4.2254719734191895, "lr": 0.0002, "elapsed_sec": 58752.7612016201, "step_time_sec": 8.230900886992458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7082, "loss": 4.456532001495361, "lr": 0.0002, "elapsed_sec": 58760.99185156822, "step_time_sec": 8.230454200995155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7083, "loss": 4.222287654876709, "lr": 0.0002, "elapsed_sec": 58769.2225317955, "step_time_sec": 8.230519442993682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7084, "loss": 4.204832077026367, "lr": 0.0002, "elapsed_sec": 58777.45326566696, "step_time_sec": 8.230631012003869, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7085, "loss": 4.176448345184326, "lr": 0.0002, "elapsed_sec": 58785.68329286575, "step_time_sec": 8.229842658009147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7086, "loss": 4.27053689956665, "lr": 0.0002, "elapsed_sec": 58793.91521906853, "step_time_sec": 8.231756691995542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7087, "loss": 4.267270088195801, "lr": 0.0002, "elapsed_sec": 58802.145027160645, "step_time_sec": 8.229689846019028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7088, "loss": 4.246140956878662, "lr": 0.0002, "elapsed_sec": 58810.373413324356, "step_time_sec": 8.228196549986023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7089, "loss": 4.24771785736084, "lr": 0.0002, "elapsed_sec": 58818.60281038284, "step_time_sec": 8.229220961016836, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7090, "loss": 4.240167140960693, "lr": 0.0002, "elapsed_sec": 58826.831315517426, "step_time_sec": 8.228351892001228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7091, "loss": 4.386175155639648, "lr": 0.0002, "elapsed_sec": 58835.06170606613, "step_time_sec": 8.23024596000323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7092, "loss": 4.274939060211182, "lr": 0.0002, "elapsed_sec": 58843.29218530655, "step_time_sec": 8.230380506982328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7093, "loss": 4.310187339782715, "lr": 0.0002, "elapsed_sec": 58851.52332878113, "step_time_sec": 8.230962709989399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7094, "loss": 4.2662787437438965, "lr": 0.0002, "elapsed_sec": 58859.75407290459, "step_time_sec": 8.230582413001684, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7095, "loss": 4.156515598297119, "lr": 0.0002, "elapsed_sec": 58867.98528337479, "step_time_sec": 8.231038530007936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7096, "loss": 4.328863143920898, "lr": 0.0002, "elapsed_sec": 58876.21635103226, "step_time_sec": 8.230925440991996, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7097, "loss": 4.4083147048950195, "lr": 0.0002, "elapsed_sec": 58884.447347164154, "step_time_sec": 8.23083370301174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7098, "loss": 4.301839351654053, "lr": 0.0002, "elapsed_sec": 58892.675567388535, "step_time_sec": 8.228056563006248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7099, "loss": 4.373326301574707, "lr": 0.0002, "elapsed_sec": 58900.90620112419, "step_time_sec": 8.230485244974261, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7100, "loss": 4.351839065551758, "lr": 0.0002, "elapsed_sec": 58909.13731956482, "step_time_sec": 8.23096385700046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7101, "loss": 4.230680465698242, "lr": 0.0002, "elapsed_sec": 58917.364933252335, "step_time_sec": 8.227489837998291, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7102, "loss": 4.3708577156066895, "lr": 0.0002, "elapsed_sec": 58925.59561038017, "step_time_sec": 8.230521613004385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7103, "loss": 4.304628849029541, "lr": 0.0002, "elapsed_sec": 58933.825184345245, "step_time_sec": 8.22944303098484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7104, "loss": 4.33763313293457, "lr": 0.0002, "elapsed_sec": 58942.05602359772, "step_time_sec": 8.230600777984364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7105, "loss": 4.260396957397461, "lr": 0.0002, "elapsed_sec": 58950.28805613518, "step_time_sec": 8.231881257990608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7106, "loss": 4.390584468841553, "lr": 0.0002, "elapsed_sec": 58958.51876807213, "step_time_sec": 8.230583806987852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7107, "loss": 4.2535929679870605, "lr": 0.0002, "elapsed_sec": 58966.74970912933, "step_time_sec": 8.230807672982337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7108, "loss": 4.272114276885986, "lr": 0.0002, "elapsed_sec": 58974.98124551773, "step_time_sec": 8.231322102976264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7109, "loss": 4.164805889129639, "lr": 0.0002, "elapsed_sec": 58983.21146225929, "step_time_sec": 8.230050066980766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7110, "loss": 4.194883823394775, "lr": 0.0002, "elapsed_sec": 58991.44222283363, "step_time_sec": 8.230605433986057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7111, "loss": 4.210783958435059, "lr": 0.0002, "elapsed_sec": 58999.6712539196, "step_time_sec": 8.228863694006577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7112, "loss": 4.2188262939453125, "lr": 0.0002, "elapsed_sec": 59007.90209746361, "step_time_sec": 8.230682427994907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7113, "loss": 4.070855140686035, "lr": 0.0002, "elapsed_sec": 59016.132452487946, "step_time_sec": 8.230170410010032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7114, "loss": 4.354476451873779, "lr": 0.0002, "elapsed_sec": 59024.361807346344, "step_time_sec": 8.229186000011396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7115, "loss": 4.402198791503906, "lr": 0.0002, "elapsed_sec": 59032.5906829834, "step_time_sec": 8.228771886002505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7116, "loss": 4.240890026092529, "lr": 0.0002, "elapsed_sec": 59040.819915771484, "step_time_sec": 8.22903166001197, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7117, "loss": 4.263121604919434, "lr": 0.0002, "elapsed_sec": 59049.05128574371, "step_time_sec": 8.23120444099186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7118, "loss": 4.244585037231445, "lr": 0.0002, "elapsed_sec": 59057.282624959946, "step_time_sec": 8.23122282398981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7119, "loss": 4.482052803039551, "lr": 0.0002, "elapsed_sec": 59065.51329278946, "step_time_sec": 8.230475432006642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7120, "loss": 4.25828742980957, "lr": 0.0002, "elapsed_sec": 59073.74436068535, "step_time_sec": 8.230972714984091, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7121, "loss": 4.288383483886719, "lr": 0.0002, "elapsed_sec": 59081.974665403366, "step_time_sec": 8.230092835001415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7122, "loss": 4.377641677856445, "lr": 0.0002, "elapsed_sec": 59090.20705413818, "step_time_sec": 8.232248406013241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7123, "loss": 4.301202774047852, "lr": 0.0002, "elapsed_sec": 59098.43743824959, "step_time_sec": 8.230264092009747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7124, "loss": 4.245328903198242, "lr": 0.0002, "elapsed_sec": 59106.66753101349, "step_time_sec": 8.229859162995126, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7125, "loss": 4.206491947174072, "lr": 0.0002, "elapsed_sec": 59114.8974506855, "step_time_sec": 8.229768176999642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7126, "loss": 4.2842278480529785, "lr": 0.0002, "elapsed_sec": 59123.12844824791, "step_time_sec": 8.230842990014935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7127, "loss": 4.258076190948486, "lr": 0.0002, "elapsed_sec": 59131.359362363815, "step_time_sec": 8.230761820013868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7128, "loss": 4.2326273918151855, "lr": 0.0002, "elapsed_sec": 59139.589896678925, "step_time_sec": 8.230355550011154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7129, "loss": 4.311878681182861, "lr": 0.0002, "elapsed_sec": 59147.821400642395, "step_time_sec": 8.231344872008776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7130, "loss": 4.220608234405518, "lr": 0.0002, "elapsed_sec": 59156.05207180977, "step_time_sec": 8.230514958006097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7131, "loss": 4.517419338226318, "lr": 0.0002, "elapsed_sec": 59164.28285527229, "step_time_sec": 8.230574926012196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7132, "loss": 4.169648170471191, "lr": 0.0002, "elapsed_sec": 59172.51308512688, "step_time_sec": 8.230100244021742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7133, "loss": 4.28350305557251, "lr": 0.0002, "elapsed_sec": 59180.74366545677, "step_time_sec": 8.230365247000009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7134, "loss": 4.427554130554199, "lr": 0.0002, "elapsed_sec": 59188.974600076675, "step_time_sec": 8.230814519018168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7135, "loss": 4.333747863769531, "lr": 0.0002, "elapsed_sec": 59197.20516681671, "step_time_sec": 8.230357397987973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7136, "loss": 4.230134963989258, "lr": 0.0002, "elapsed_sec": 59205.43592214584, "step_time_sec": 8.230619676993228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7137, "loss": 4.406367778778076, "lr": 0.0002, "elapsed_sec": 59213.66601610184, "step_time_sec": 8.229916004027473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7138, "loss": 4.290208339691162, "lr": 0.0002, "elapsed_sec": 59221.89684462547, "step_time_sec": 8.230658916989341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7139, "loss": 4.1879777908325195, "lr": 0.0002, "elapsed_sec": 59230.127707481384, "step_time_sec": 8.23072126400075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7140, "loss": 4.256072044372559, "lr": 0.0002, "elapsed_sec": 59238.35807824135, "step_time_sec": 8.230162590974942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7141, "loss": 4.190896034240723, "lr": 0.0002, "elapsed_sec": 59246.587225437164, "step_time_sec": 8.229005181987304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7142, "loss": 4.205082416534424, "lr": 0.0002, "elapsed_sec": 59254.815856695175, "step_time_sec": 8.228485321014887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7143, "loss": 4.221002101898193, "lr": 0.0002, "elapsed_sec": 59263.04735302925, "step_time_sec": 8.231314628996188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7144, "loss": 4.2996392250061035, "lr": 0.0002, "elapsed_sec": 59271.27769398689, "step_time_sec": 8.230172572017182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7145, "loss": 4.269443511962891, "lr": 0.0002, "elapsed_sec": 59279.509115219116, "step_time_sec": 8.231327704997966, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7146, "loss": 4.283168315887451, "lr": 0.0002, "elapsed_sec": 59287.739943265915, "step_time_sec": 8.230635627987795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7147, "loss": 4.114511966705322, "lr": 0.0002, "elapsed_sec": 59295.970720767975, "step_time_sec": 8.230557352973847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7148, "loss": 4.213223934173584, "lr": 0.0002, "elapsed_sec": 59304.20124864578, "step_time_sec": 8.230436398996972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7149, "loss": 4.257501125335693, "lr": 0.0002, "elapsed_sec": 59312.431666851044, "step_time_sec": 8.230196218006313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7150, "loss": 4.1778106689453125, "lr": 0.0002, "elapsed_sec": 59320.66246056557, "step_time_sec": 8.230677572981222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7151, "loss": 4.058121204376221, "lr": 0.0002, "elapsed_sec": 59328.893225193024, "step_time_sec": 8.230595583998365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7152, "loss": 4.256896495819092, "lr": 0.0002, "elapsed_sec": 59337.12384843826, "step_time_sec": 8.23046594200423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7153, "loss": 4.194982051849365, "lr": 0.0002, "elapsed_sec": 59345.35239481926, "step_time_sec": 8.22838879399933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7154, "loss": 4.267329216003418, "lr": 0.0002, "elapsed_sec": 59353.58192110062, "step_time_sec": 8.22944634698797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7155, "loss": 4.162489414215088, "lr": 0.0002, "elapsed_sec": 59361.811676979065, "step_time_sec": 8.229513158992631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7156, "loss": 4.2403154373168945, "lr": 0.0002, "elapsed_sec": 59370.04259634018, "step_time_sec": 8.230836572009139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7157, "loss": 4.50042200088501, "lr": 0.0002, "elapsed_sec": 59378.273690223694, "step_time_sec": 8.230927588010672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7158, "loss": 4.452422142028809, "lr": 0.0002, "elapsed_sec": 59386.501870155334, "step_time_sec": 8.22800205097883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7159, "loss": 4.872154235839844, "lr": 0.0002, "elapsed_sec": 59394.730516672134, "step_time_sec": 8.228438450983958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7160, "loss": 4.587770462036133, "lr": 0.0002, "elapsed_sec": 59402.959575653076, "step_time_sec": 8.228957284998614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7161, "loss": 4.658787250518799, "lr": 0.0002, "elapsed_sec": 59411.189395427704, "step_time_sec": 8.22963781899307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7162, "loss": 4.64914608001709, "lr": 0.0002, "elapsed_sec": 59419.41969180107, "step_time_sec": 8.230177807010477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7163, "loss": 4.23443603515625, "lr": 0.0002, "elapsed_sec": 59427.64754700661, "step_time_sec": 8.227684436977142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7164, "loss": 4.450344562530518, "lr": 0.0002, "elapsed_sec": 59435.8759226799, "step_time_sec": 8.228183703002287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7165, "loss": 4.351222991943359, "lr": 0.0002, "elapsed_sec": 59444.106313467026, "step_time_sec": 8.230299318005564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7166, "loss": 4.382931232452393, "lr": 0.0002, "elapsed_sec": 59452.33742785454, "step_time_sec": 8.230912250000983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7167, "loss": 4.3332085609436035, "lr": 0.0002, "elapsed_sec": 59460.5690677166, "step_time_sec": 8.23146971600363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7168, "loss": 4.377816677093506, "lr": 0.0002, "elapsed_sec": 59468.79793906212, "step_time_sec": 8.228699145984137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7169, "loss": 4.254580497741699, "lr": 0.0002, "elapsed_sec": 59477.02912712097, "step_time_sec": 8.23108176101232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7170, "loss": 4.342175006866455, "lr": 0.0002, "elapsed_sec": 59485.26046514511, "step_time_sec": 8.231131064996589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7171, "loss": 4.208620071411133, "lr": 0.0002, "elapsed_sec": 59493.49001455307, "step_time_sec": 8.229398901981767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7172, "loss": 4.4035444259643555, "lr": 0.0002, "elapsed_sec": 59501.721959114075, "step_time_sec": 8.231767959019635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7173, "loss": 4.2528510093688965, "lr": 0.0002, "elapsed_sec": 59510.909136772156, "step_time_sec": 9.187017279997235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7174, "loss": 4.296019554138184, "lr": 0.0002, "elapsed_sec": 59519.137508153915, "step_time_sec": 8.22825878902222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7175, "loss": 4.345505714416504, "lr": 0.0002, "elapsed_sec": 59527.36869215965, "step_time_sec": 8.231005905981874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7176, "loss": 4.3364338874816895, "lr": 0.0002, "elapsed_sec": 59535.60056185722, "step_time_sec": 8.231765679985983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7177, "loss": 4.251715660095215, "lr": 0.0002, "elapsed_sec": 59543.83169221878, "step_time_sec": 8.230883518001065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7178, "loss": 4.332816123962402, "lr": 0.0002, "elapsed_sec": 59552.06261134148, "step_time_sec": 8.230772508017253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7179, "loss": 4.145097255706787, "lr": 0.0002, "elapsed_sec": 59560.29370713234, "step_time_sec": 8.230992969998624, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7180, "loss": 4.281844615936279, "lr": 0.0002, "elapsed_sec": 59568.525403261185, "step_time_sec": 8.2314957450144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7181, "loss": 4.33909797668457, "lr": 0.0002, "elapsed_sec": 59576.755293130875, "step_time_sec": 8.229720231989631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7182, "loss": 4.47258186340332, "lr": 0.0002, "elapsed_sec": 59584.98654985428, "step_time_sec": 8.231112524023047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7183, "loss": 3.9383041858673096, "lr": 0.0002, "elapsed_sec": 59593.216844558716, "step_time_sec": 8.230132397002308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7184, "loss": 4.222435474395752, "lr": 0.0002, "elapsed_sec": 59601.447201251984, "step_time_sec": 8.23019145001308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7185, "loss": 4.200333595275879, "lr": 0.0002, "elapsed_sec": 59609.677765369415, "step_time_sec": 8.230431884003337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7186, "loss": 4.465637683868408, "lr": 0.0002, "elapsed_sec": 59617.908515930176, "step_time_sec": 8.230583361000754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7187, "loss": 4.243476390838623, "lr": 0.0002, "elapsed_sec": 59626.13974881172, "step_time_sec": 8.231059348996496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7188, "loss": 4.274372577667236, "lr": 0.0002, "elapsed_sec": 59634.36960363388, "step_time_sec": 8.229712886997731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7189, "loss": 4.33759880065918, "lr": 0.0002, "elapsed_sec": 59642.60121202469, "step_time_sec": 8.2314065110113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7190, "loss": 4.001141548156738, "lr": 0.0002, "elapsed_sec": 59650.83200383186, "step_time_sec": 8.23061807799968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7191, "loss": 4.375055313110352, "lr": 0.0002, "elapsed_sec": 59659.06193137169, "step_time_sec": 8.229759551992174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7192, "loss": 4.070371150970459, "lr": 0.0002, "elapsed_sec": 59667.2914249897, "step_time_sec": 8.229397584014805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7193, "loss": 4.324862957000732, "lr": 0.0002, "elapsed_sec": 59675.52137994766, "step_time_sec": 8.229754202999175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7194, "loss": 4.241950035095215, "lr": 0.0002, "elapsed_sec": 59683.752281188965, "step_time_sec": 8.230795518000377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7195, "loss": 4.172049045562744, "lr": 0.0002, "elapsed_sec": 59691.983850479126, "step_time_sec": 8.23133160098223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7196, "loss": 4.213099002838135, "lr": 0.0002, "elapsed_sec": 59700.21498465538, "step_time_sec": 8.231017443991732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7197, "loss": 4.16690731048584, "lr": 0.0002, "elapsed_sec": 59708.444991350174, "step_time_sec": 8.229861644009361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7198, "loss": 4.315940856933594, "lr": 0.0002, "elapsed_sec": 59716.67588067055, "step_time_sec": 8.230655173014384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7199, "loss": 4.277928829193115, "lr": 0.0002, "elapsed_sec": 59724.906304359436, "step_time_sec": 8.230229591019452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7200, "loss": 4.313178062438965, "lr": 0.0002, "elapsed_sec": 59733.137373924255, "step_time_sec": 8.230966191011248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7201, "loss": 4.281368255615234, "lr": 0.0002, "elapsed_sec": 59741.36859226227, "step_time_sec": 8.231011725991266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7202, "loss": 4.303987503051758, "lr": 0.0002, "elapsed_sec": 59749.59990119934, "step_time_sec": 8.231212308979593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7203, "loss": 4.305237293243408, "lr": 0.0002, "elapsed_sec": 59757.83105158806, "step_time_sec": 8.230991685006302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7204, "loss": 4.19396448135376, "lr": 0.0002, "elapsed_sec": 59766.062467336655, "step_time_sec": 8.231209263991332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7205, "loss": 4.039100646972656, "lr": 0.0002, "elapsed_sec": 59774.29405093193, "step_time_sec": 8.231462680996628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7206, "loss": 4.3837056159973145, "lr": 0.0002, "elapsed_sec": 59782.52504181862, "step_time_sec": 8.230822219979018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7207, "loss": 4.248893737792969, "lr": 0.0002, "elapsed_sec": 59790.75599980354, "step_time_sec": 8.23084631099482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7208, "loss": 4.28562068939209, "lr": 0.0002, "elapsed_sec": 59798.98715925217, "step_time_sec": 8.230928910983494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7209, "loss": 4.375720977783203, "lr": 0.0002, "elapsed_sec": 59807.217477321625, "step_time_sec": 8.230195003998233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7210, "loss": 4.227180004119873, "lr": 0.0002, "elapsed_sec": 59815.448776721954, "step_time_sec": 8.231115406000754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7211, "loss": 4.410199165344238, "lr": 0.0002, "elapsed_sec": 59823.679233551025, "step_time_sec": 8.230327661003685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7212, "loss": 4.239195346832275, "lr": 0.0002, "elapsed_sec": 59831.90957927704, "step_time_sec": 8.230175061005866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7213, "loss": 4.285372257232666, "lr": 0.0002, "elapsed_sec": 59840.139823675156, "step_time_sec": 8.23015053299605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7214, "loss": 4.09864616394043, "lr": 0.0002, "elapsed_sec": 59848.370612859726, "step_time_sec": 8.230602419003844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7215, "loss": 4.244015216827393, "lr": 0.0002, "elapsed_sec": 59856.60215616226, "step_time_sec": 8.231386088009458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7216, "loss": 4.260793209075928, "lr": 0.0002, "elapsed_sec": 59864.83342337608, "step_time_sec": 8.231150567997247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7217, "loss": 4.15254545211792, "lr": 0.0002, "elapsed_sec": 59873.06563925743, "step_time_sec": 8.232054182997672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7218, "loss": 4.288451671600342, "lr": 0.0002, "elapsed_sec": 59881.29634785652, "step_time_sec": 8.230569595005363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7219, "loss": 4.3102707862854, "lr": 0.0002, "elapsed_sec": 59889.52738547325, "step_time_sec": 8.23085470998194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7220, "loss": 4.1770920753479, "lr": 0.0002, "elapsed_sec": 59897.75837016106, "step_time_sec": 8.230839205003576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7221, "loss": 4.265194416046143, "lr": 0.0002, "elapsed_sec": 59905.989135980606, "step_time_sec": 8.230551239976194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7222, "loss": 4.258901596069336, "lr": 0.0002, "elapsed_sec": 59914.2207736969, "step_time_sec": 8.231523179012584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7223, "loss": 4.211154460906982, "lr": 0.0002, "elapsed_sec": 59922.452394247055, "step_time_sec": 8.231457467016298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7224, "loss": 4.341794490814209, "lr": 0.0002, "elapsed_sec": 59930.684222221375, "step_time_sec": 8.231671943998663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7225, "loss": 4.2820539474487305, "lr": 0.0002, "elapsed_sec": 59938.91515684128, "step_time_sec": 8.230770596012007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7226, "loss": 4.300375461578369, "lr": 0.0002, "elapsed_sec": 59947.146510362625, "step_time_sec": 8.231182018003892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7227, "loss": 4.104107856750488, "lr": 0.0002, "elapsed_sec": 59955.376271247864, "step_time_sec": 8.229717818991048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7228, "loss": 4.267893314361572, "lr": 0.0002, "elapsed_sec": 59963.60523676872, "step_time_sec": 8.228717154008336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7229, "loss": 4.043006896972656, "lr": 0.0002, "elapsed_sec": 59971.834648132324, "step_time_sec": 8.229249206982786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7230, "loss": 4.377396583557129, "lr": 0.0002, "elapsed_sec": 59980.06305146217, "step_time_sec": 8.22830009800964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7231, "loss": 4.214759826660156, "lr": 0.0002, "elapsed_sec": 59988.29471182823, "step_time_sec": 8.23150630100281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7232, "loss": 4.112260341644287, "lr": 0.0002, "elapsed_sec": 59996.52520108223, "step_time_sec": 8.23031820700271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7233, "loss": 4.102215766906738, "lr": 0.0002, "elapsed_sec": 60004.752178907394, "step_time_sec": 8.226853875006782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7234, "loss": 4.2932000160217285, "lr": 0.0002, "elapsed_sec": 60012.98154759407, "step_time_sec": 8.22917674598284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7235, "loss": 4.325409889221191, "lr": 0.0002, "elapsed_sec": 60021.208933115005, "step_time_sec": 8.22721219700179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7236, "loss": 4.308413028717041, "lr": 0.0002, "elapsed_sec": 60029.44013547897, "step_time_sec": 8.231036682002014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7237, "loss": 4.239589691162109, "lr": 0.0002, "elapsed_sec": 60037.67045855522, "step_time_sec": 8.230169574992033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7238, "loss": 4.164575099945068, "lr": 0.0002, "elapsed_sec": 60045.90043926239, "step_time_sec": 8.229813440993894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7239, "loss": 4.104711055755615, "lr": 0.0002, "elapsed_sec": 60054.13007283211, "step_time_sec": 8.229465657001128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7240, "loss": 4.176596164703369, "lr": 0.0002, "elapsed_sec": 60062.35777139664, "step_time_sec": 8.227532283985056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7241, "loss": 4.196403503417969, "lr": 0.0002, "elapsed_sec": 60070.58999347687, "step_time_sec": 8.232055427011801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7242, "loss": 4.312899112701416, "lr": 0.0002, "elapsed_sec": 60078.82047176361, "step_time_sec": 8.230313329026103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7243, "loss": 4.056081295013428, "lr": 0.0002, "elapsed_sec": 60087.050465106964, "step_time_sec": 8.229822658991907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7244, "loss": 4.081727981567383, "lr": 0.0002, "elapsed_sec": 60095.281019210815, "step_time_sec": 8.230456371005857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7245, "loss": 4.0981035232543945, "lr": 0.0002, "elapsed_sec": 60103.512046575546, "step_time_sec": 8.230804694991093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7246, "loss": 4.1755828857421875, "lr": 0.0002, "elapsed_sec": 60111.74298238754, "step_time_sec": 8.230742165993433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7247, "loss": 4.169550895690918, "lr": 0.0002, "elapsed_sec": 60119.973046541214, "step_time_sec": 8.229993553017266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7248, "loss": 4.239980697631836, "lr": 0.0002, "elapsed_sec": 60128.20272278786, "step_time_sec": 8.229529250995256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7249, "loss": 4.0488104820251465, "lr": 0.0002, "elapsed_sec": 60136.4333319664, "step_time_sec": 8.230395563994534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7250, "loss": 4.15069580078125, "lr": 0.0002, "elapsed_sec": 60144.6648581028, "step_time_sec": 8.231359819998033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7251, "loss": 4.28174352645874, "lr": 0.0002, "elapsed_sec": 60152.89359354973, "step_time_sec": 8.228567076002946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7252, "loss": 4.192539215087891, "lr": 0.0002, "elapsed_sec": 60161.122460603714, "step_time_sec": 8.228727347013773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7253, "loss": 4.095254421234131, "lr": 0.0002, "elapsed_sec": 60169.35155797005, "step_time_sec": 8.228938488988206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7254, "loss": 4.214378833770752, "lr": 0.0002, "elapsed_sec": 60177.581986904144, "step_time_sec": 8.23027781600831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7255, "loss": 4.290624618530273, "lr": 0.0002, "elapsed_sec": 60185.81292462349, "step_time_sec": 8.230786051019095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7256, "loss": 4.282445907592773, "lr": 0.0002, "elapsed_sec": 60194.04342460632, "step_time_sec": 8.230374206003034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7257, "loss": 4.193511486053467, "lr": 0.0002, "elapsed_sec": 60202.274344205856, "step_time_sec": 8.230738965998171, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7258, "loss": 4.201292514801025, "lr": 0.0002, "elapsed_sec": 60210.50609564781, "step_time_sec": 8.231614604010247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7259, "loss": 4.177244663238525, "lr": 0.0002, "elapsed_sec": 60218.73691391945, "step_time_sec": 8.230673887010198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7260, "loss": 4.1964311599731445, "lr": 0.0002, "elapsed_sec": 60226.96799182892, "step_time_sec": 8.230936931999167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7261, "loss": 4.177684307098389, "lr": 0.0002, "elapsed_sec": 60235.197861909866, "step_time_sec": 8.229697794013191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7262, "loss": 4.065543174743652, "lr": 0.0002, "elapsed_sec": 60243.428703308105, "step_time_sec": 8.230700513988268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7263, "loss": 4.179266452789307, "lr": 0.0002, "elapsed_sec": 60251.65987586975, "step_time_sec": 8.23096524097491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7264, "loss": 4.111903667449951, "lr": 0.0002, "elapsed_sec": 60259.89121365547, "step_time_sec": 8.231213379011024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7265, "loss": 4.137232780456543, "lr": 0.0002, "elapsed_sec": 60268.12014770508, "step_time_sec": 8.228801755001768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7266, "loss": 4.237411975860596, "lr": 0.0002, "elapsed_sec": 60276.34909963608, "step_time_sec": 8.228791483998066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7267, "loss": 4.155388832092285, "lr": 0.0002, "elapsed_sec": 60284.58047866821, "step_time_sec": 8.23121318500489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7268, "loss": 4.166393280029297, "lr": 0.0002, "elapsed_sec": 60292.811357975006, "step_time_sec": 8.230715946003329, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7269, "loss": 4.161990642547607, "lr": 0.0002, "elapsed_sec": 60301.04133296013, "step_time_sec": 8.229861603991594, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7270, "loss": 4.333831787109375, "lr": 0.0002, "elapsed_sec": 60309.27207612991, "step_time_sec": 8.230560768017313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7271, "loss": 4.20978307723999, "lr": 0.0002, "elapsed_sec": 60317.50357699394, "step_time_sec": 8.231320068007335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7272, "loss": 4.121123313903809, "lr": 0.0002, "elapsed_sec": 60325.733528375626, "step_time_sec": 8.229795082006603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7273, "loss": 4.053124904632568, "lr": 0.0002, "elapsed_sec": 60333.964179992676, "step_time_sec": 8.23051707400009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7274, "loss": 4.070427417755127, "lr": 0.0002, "elapsed_sec": 60342.19195485115, "step_time_sec": 8.227619641984347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7275, "loss": 4.168816566467285, "lr": 0.0002, "elapsed_sec": 60350.42381668091, "step_time_sec": 8.23176977399271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7276, "loss": 4.020937442779541, "lr": 0.0002, "elapsed_sec": 60358.65506887436, "step_time_sec": 8.231118879979476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7277, "loss": 4.237240791320801, "lr": 0.0002, "elapsed_sec": 60366.88597917557, "step_time_sec": 8.230677370011108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7278, "loss": 3.9757814407348633, "lr": 0.0002, "elapsed_sec": 60375.11727833748, "step_time_sec": 8.231227976997616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7279, "loss": 4.059634208679199, "lr": 0.0002, "elapsed_sec": 60383.34773540497, "step_time_sec": 8.230269306979608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7280, "loss": 4.11731481552124, "lr": 0.0002, "elapsed_sec": 60391.577770233154, "step_time_sec": 8.22985355500714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7281, "loss": 4.222392559051514, "lr": 0.0002, "elapsed_sec": 60399.8093214035, "step_time_sec": 8.23137322999537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7282, "loss": 4.015250205993652, "lr": 0.0002, "elapsed_sec": 60408.03964972496, "step_time_sec": 8.230164503998822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7283, "loss": 4.013568878173828, "lr": 0.0002, "elapsed_sec": 60416.26862740517, "step_time_sec": 8.22882736599422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7284, "loss": 4.1453166007995605, "lr": 0.0002, "elapsed_sec": 60424.49783992767, "step_time_sec": 8.229052081005648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7285, "loss": 4.053742408752441, "lr": 0.0002, "elapsed_sec": 60432.728682518005, "step_time_sec": 8.230703212990193, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7286, "loss": 4.080325126647949, "lr": 0.0002, "elapsed_sec": 60440.96006011963, "step_time_sec": 8.231206277007004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7287, "loss": 4.185528755187988, "lr": 0.0002, "elapsed_sec": 60449.19057703018, "step_time_sec": 8.230398426007014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7288, "loss": 4.066404342651367, "lr": 0.0002, "elapsed_sec": 60457.42306447029, "step_time_sec": 8.232300399016822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7289, "loss": 4.045673847198486, "lr": 0.0002, "elapsed_sec": 60465.65411949158, "step_time_sec": 8.2309531729843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7290, "loss": 4.278584957122803, "lr": 0.0002, "elapsed_sec": 60473.885877132416, "step_time_sec": 8.231538205989636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7291, "loss": 3.9600324630737305, "lr": 0.0002, "elapsed_sec": 60482.11519098282, "step_time_sec": 8.229168257006677, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7292, "loss": 4.167283058166504, "lr": 0.0002, "elapsed_sec": 60490.342861413956, "step_time_sec": 8.227544403984211, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7293, "loss": 4.2410173416137695, "lr": 0.0002, "elapsed_sec": 60498.573927640915, "step_time_sec": 8.230902668001363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7294, "loss": 4.150339603424072, "lr": 0.0002, "elapsed_sec": 60506.80444598198, "step_time_sec": 8.230307244986761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7295, "loss": 4.199257850646973, "lr": 0.0002, "elapsed_sec": 60515.035903692245, "step_time_sec": 8.231323663989315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7296, "loss": 4.212107181549072, "lr": 0.0002, "elapsed_sec": 60523.266518116, "step_time_sec": 8.230458992999047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7297, "loss": 4.1220855712890625, "lr": 0.0002, "elapsed_sec": 60531.49782729149, "step_time_sec": 8.231151029001921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7298, "loss": 4.059177398681641, "lr": 0.0002, "elapsed_sec": 60539.72795343399, "step_time_sec": 8.229986926016863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7299, "loss": 4.167227268218994, "lr": 0.0002, "elapsed_sec": 60547.95692539215, "step_time_sec": 8.228803587990114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7300, "loss": 4.307835102081299, "lr": 0.0002, "elapsed_sec": 60556.1880774498, "step_time_sec": 8.23100225199596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7301, "loss": 4.329254627227783, "lr": 0.0002, "elapsed_sec": 60564.41922259331, "step_time_sec": 8.23099129300681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7302, "loss": 4.076909065246582, "lr": 0.0002, "elapsed_sec": 60572.65151691437, "step_time_sec": 8.232132584991632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7303, "loss": 4.212214946746826, "lr": 0.0002, "elapsed_sec": 60580.88274717331, "step_time_sec": 8.231123011006275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7304, "loss": 4.144636631011963, "lr": 0.0002, "elapsed_sec": 60589.11373806, "step_time_sec": 8.230793458002154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7305, "loss": 4.100246429443359, "lr": 0.0002, "elapsed_sec": 60597.3446867466, "step_time_sec": 8.230796284013195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7306, "loss": 4.041450023651123, "lr": 0.0002, "elapsed_sec": 60605.57509422302, "step_time_sec": 8.230272115004482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7307, "loss": 4.398438453674316, "lr": 0.0002, "elapsed_sec": 60613.805931806564, "step_time_sec": 8.230734011012828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7308, "loss": 4.212082386016846, "lr": 0.0002, "elapsed_sec": 60622.03691625595, "step_time_sec": 8.230742362997262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7309, "loss": 4.218846321105957, "lr": 0.0002, "elapsed_sec": 60630.2691655159, "step_time_sec": 8.232114440994337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7310, "loss": 4.203805923461914, "lr": 0.0002, "elapsed_sec": 60638.49966311455, "step_time_sec": 8.230396742001176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7311, "loss": 4.104883193969727, "lr": 0.0002, "elapsed_sec": 60646.73175430298, "step_time_sec": 8.231916357995942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7312, "loss": 4.00924825668335, "lr": 0.0002, "elapsed_sec": 60654.96348071098, "step_time_sec": 8.231525462004356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7313, "loss": 4.21160888671875, "lr": 0.0002, "elapsed_sec": 60663.19415783882, "step_time_sec": 8.230520142009482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7314, "loss": 4.065821647644043, "lr": 0.0002, "elapsed_sec": 60671.42545771599, "step_time_sec": 8.231130833009956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7315, "loss": 4.1273322105407715, "lr": 0.0002, "elapsed_sec": 60679.65653538704, "step_time_sec": 8.230976760998601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7316, "loss": 4.152756214141846, "lr": 0.0002, "elapsed_sec": 60687.88752794266, "step_time_sec": 8.230786611995427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7317, "loss": 4.062178611755371, "lr": 0.0002, "elapsed_sec": 60696.1180202961, "step_time_sec": 8.230388565018075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7318, "loss": 4.081259727478027, "lr": 0.0002, "elapsed_sec": 60704.34962105751, "step_time_sec": 8.231406237988267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7319, "loss": 4.155380725860596, "lr": 0.0002, "elapsed_sec": 60712.58101582527, "step_time_sec": 8.23123929798021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7320, "loss": 4.128063201904297, "lr": 0.0002, "elapsed_sec": 60720.812972307205, "step_time_sec": 8.231845547998091, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7321, "loss": 4.1153950691223145, "lr": 0.0002, "elapsed_sec": 60729.043545246124, "step_time_sec": 8.2303647279914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7322, "loss": 4.011810779571533, "lr": 0.0002, "elapsed_sec": 60737.274627923965, "step_time_sec": 8.230932887003291, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7323, "loss": 4.070493221282959, "lr": 0.0002, "elapsed_sec": 60745.5056681633, "step_time_sec": 8.230880776012782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7324, "loss": 4.120403289794922, "lr": 0.0002, "elapsed_sec": 60753.736251592636, "step_time_sec": 8.23050103802234, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7325, "loss": 4.171430587768555, "lr": 0.0002, "elapsed_sec": 60761.96731901169, "step_time_sec": 8.230827375984518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7326, "loss": 4.179549694061279, "lr": 0.0002, "elapsed_sec": 60770.19815468788, "step_time_sec": 8.230708335002419, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7327, "loss": 4.0780029296875, "lr": 0.0002, "elapsed_sec": 60778.43002510071, "step_time_sec": 8.23176218001754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7328, "loss": 4.329292297363281, "lr": 0.0002, "elapsed_sec": 60786.660876750946, "step_time_sec": 8.230613200983498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7329, "loss": 4.160965919494629, "lr": 0.0002, "elapsed_sec": 60794.89182305336, "step_time_sec": 8.230805497005349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7330, "loss": 4.091414928436279, "lr": 0.0002, "elapsed_sec": 60803.1219933033, "step_time_sec": 8.230068331002258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7331, "loss": 4.1326704025268555, "lr": 0.0002, "elapsed_sec": 60811.35228872299, "step_time_sec": 8.23008900199784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7332, "loss": 4.179546356201172, "lr": 0.0002, "elapsed_sec": 60819.58235192299, "step_time_sec": 8.229884933010908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7333, "loss": 4.113008975982666, "lr": 0.0002, "elapsed_sec": 60827.81293606758, "step_time_sec": 8.230425080982968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7334, "loss": 4.092822551727295, "lr": 0.0002, "elapsed_sec": 60836.04432845116, "step_time_sec": 8.23132928297855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7335, "loss": 4.220553398132324, "lr": 0.0002, "elapsed_sec": 60844.27345252037, "step_time_sec": 8.228960947017185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7336, "loss": 4.259091854095459, "lr": 0.0002, "elapsed_sec": 60852.50289940834, "step_time_sec": 8.22922080400167, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7337, "loss": 4.120635986328125, "lr": 0.0002, "elapsed_sec": 60860.73157811165, "step_time_sec": 8.228529443993466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7338, "loss": 4.190451622009277, "lr": 0.0002, "elapsed_sec": 60868.961280584335, "step_time_sec": 8.229578936006874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7339, "loss": 4.120813846588135, "lr": 0.0002, "elapsed_sec": 60877.19163894653, "step_time_sec": 8.230182819999754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7340, "loss": 4.2063212394714355, "lr": 0.0002, "elapsed_sec": 60885.42001700401, "step_time_sec": 8.228239508986007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7341, "loss": 4.181320667266846, "lr": 0.0002, "elapsed_sec": 60893.65016794205, "step_time_sec": 8.22999771501054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7342, "loss": 4.188462734222412, "lr": 0.0002, "elapsed_sec": 60901.87883067131, "step_time_sec": 8.228457194985822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7343, "loss": 4.236639499664307, "lr": 0.0002, "elapsed_sec": 60910.10982131958, "step_time_sec": 8.2308950489969, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7344, "loss": 4.278195858001709, "lr": 0.0002, "elapsed_sec": 60918.34082722664, "step_time_sec": 8.230817282979842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7345, "loss": 4.005557060241699, "lr": 0.0002, "elapsed_sec": 60926.568739652634, "step_time_sec": 8.227700774994446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7346, "loss": 4.078395843505859, "lr": 0.0002, "elapsed_sec": 60934.7975769043, "step_time_sec": 8.228706505993614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7347, "loss": 4.143735408782959, "lr": 0.0002, "elapsed_sec": 60943.02558016777, "step_time_sec": 8.227834140998311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7348, "loss": 4.081812381744385, "lr": 0.0002, "elapsed_sec": 60951.25560069084, "step_time_sec": 8.229884273983771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7349, "loss": 4.067895412445068, "lr": 0.0002, "elapsed_sec": 60959.48557186127, "step_time_sec": 8.22977201201138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7350, "loss": 4.154891014099121, "lr": 0.0002, "elapsed_sec": 60967.71422600746, "step_time_sec": 8.228480982012115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7351, "loss": 4.211248874664307, "lr": 0.0002, "elapsed_sec": 60975.9450340271, "step_time_sec": 8.23067369000637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7352, "loss": 4.163737773895264, "lr": 0.0002, "elapsed_sec": 60984.17451834679, "step_time_sec": 8.229313319985522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7353, "loss": 4.104503154754639, "lr": 0.0002, "elapsed_sec": 60992.40522646904, "step_time_sec": 8.230564538011095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7354, "loss": 4.126185417175293, "lr": 0.0002, "elapsed_sec": 61000.63611078262, "step_time_sec": 8.230711251002504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7355, "loss": 4.234287261962891, "lr": 0.0002, "elapsed_sec": 61008.86591696739, "step_time_sec": 8.229682864010101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7356, "loss": 4.198551654815674, "lr": 0.0002, "elapsed_sec": 61017.09671854973, "step_time_sec": 8.230601125978865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7357, "loss": 4.03156042098999, "lr": 0.0002, "elapsed_sec": 61025.32747960091, "step_time_sec": 8.23058567600674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7358, "loss": 4.441244602203369, "lr": 0.0002, "elapsed_sec": 61033.55803608894, "step_time_sec": 8.230415906000417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7359, "loss": 4.130824565887451, "lr": 0.0002, "elapsed_sec": 61041.78894519806, "step_time_sec": 8.23072754801251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7360, "loss": 4.20040225982666, "lr": 0.0002, "elapsed_sec": 61050.019313812256, "step_time_sec": 8.230234960006783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7361, "loss": 4.223947525024414, "lr": 0.0002, "elapsed_sec": 61058.250098228455, "step_time_sec": 8.23070800199639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7362, "loss": 4.220559597015381, "lr": 0.0002, "elapsed_sec": 61066.48089385033, "step_time_sec": 8.230613319989061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7363, "loss": 4.201827526092529, "lr": 0.0002, "elapsed_sec": 61074.70967793465, "step_time_sec": 8.228595195978414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7364, "loss": 4.207417964935303, "lr": 0.0002, "elapsed_sec": 61082.93858623505, "step_time_sec": 8.228772984002717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7365, "loss": 4.195929527282715, "lr": 0.0002, "elapsed_sec": 61091.16669368744, "step_time_sec": 8.227981655974872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7366, "loss": 4.179278373718262, "lr": 0.0002, "elapsed_sec": 61099.39794278145, "step_time_sec": 8.231011287018191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7367, "loss": 4.138319492340088, "lr": 0.0002, "elapsed_sec": 61107.62856841087, "step_time_sec": 8.230455520009855, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7368, "loss": 4.261010646820068, "lr": 0.0002, "elapsed_sec": 61115.86004638672, "step_time_sec": 8.231303406995721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7369, "loss": 4.060548305511475, "lr": 0.0002, "elapsed_sec": 61124.09195327759, "step_time_sec": 8.231776664004428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7370, "loss": 4.108155250549316, "lr": 0.0002, "elapsed_sec": 61132.322937488556, "step_time_sec": 8.230793266004184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7371, "loss": 4.173793315887451, "lr": 0.0002, "elapsed_sec": 61140.55347657204, "step_time_sec": 8.230412099015666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7372, "loss": 4.188884258270264, "lr": 0.0002, "elapsed_sec": 61148.783662080765, "step_time_sec": 8.230035127984593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7373, "loss": 4.092854976654053, "lr": 0.0002, "elapsed_sec": 61157.01121520996, "step_time_sec": 8.227331154019339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7374, "loss": 4.176169395446777, "lr": 0.0002, "elapsed_sec": 61165.24168896675, "step_time_sec": 8.230325241980609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7375, "loss": 4.086298942565918, "lr": 0.0002, "elapsed_sec": 61173.47181034088, "step_time_sec": 8.22997927799588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7376, "loss": 4.1344122886657715, "lr": 0.0002, "elapsed_sec": 61181.700444698334, "step_time_sec": 8.228478467994137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7377, "loss": 4.118267059326172, "lr": 0.0002, "elapsed_sec": 61189.92982649803, "step_time_sec": 8.229225057002623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7378, "loss": 4.164527893066406, "lr": 0.0002, "elapsed_sec": 61198.158537864685, "step_time_sec": 8.228594859014265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7379, "loss": 4.105403900146484, "lr": 0.0002, "elapsed_sec": 61206.388979673386, "step_time_sec": 8.23026483602007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7380, "loss": 4.161746025085449, "lr": 0.0002, "elapsed_sec": 61214.62015581131, "step_time_sec": 8.23097765102284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7381, "loss": 4.213942527770996, "lr": 0.0002, "elapsed_sec": 61222.85019659996, "step_time_sec": 8.229858067003079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7382, "loss": 4.222307205200195, "lr": 0.0002, "elapsed_sec": 61231.07955622673, "step_time_sec": 8.229210085002705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7383, "loss": 4.050964832305908, "lr": 0.0002, "elapsed_sec": 61239.31121659279, "step_time_sec": 8.231546740978956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7384, "loss": 4.147028923034668, "lr": 0.0002, "elapsed_sec": 61247.54224085808, "step_time_sec": 8.230830590007827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7385, "loss": 4.097509860992432, "lr": 0.0002, "elapsed_sec": 61255.773634672165, "step_time_sec": 8.231232246005675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7386, "loss": 4.057460308074951, "lr": 0.0002, "elapsed_sec": 61264.004735946655, "step_time_sec": 8.230982400011271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7387, "loss": 4.230255603790283, "lr": 0.0002, "elapsed_sec": 61272.2357840538, "step_time_sec": 8.230859988019802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7388, "loss": 4.270650863647461, "lr": 0.0002, "elapsed_sec": 61280.46709084511, "step_time_sec": 8.231154773005983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7389, "loss": 4.13115119934082, "lr": 0.0002, "elapsed_sec": 61288.69737410545, "step_time_sec": 8.23011127798236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7390, "loss": 4.204346656799316, "lr": 0.0002, "elapsed_sec": 61296.9292576313, "step_time_sec": 8.23172570799943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7391, "loss": 4.13447380065918, "lr": 0.0002, "elapsed_sec": 61305.159675598145, "step_time_sec": 8.230247544008307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7392, "loss": 4.123856067657471, "lr": 0.0002, "elapsed_sec": 61313.39105343819, "step_time_sec": 8.231277508981293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7393, "loss": 4.228688716888428, "lr": 0.0002, "elapsed_sec": 61321.61937570572, "step_time_sec": 8.228160030004801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7394, "loss": 4.305875301361084, "lr": 0.0002, "elapsed_sec": 61329.84972476959, "step_time_sec": 8.230193480994785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7395, "loss": 4.271513938903809, "lr": 0.0002, "elapsed_sec": 61338.080666065216, "step_time_sec": 8.230757452984108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7396, "loss": 4.123651027679443, "lr": 0.0002, "elapsed_sec": 61346.3112654686, "step_time_sec": 8.230448279005941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7397, "loss": 4.244271755218506, "lr": 0.0002, "elapsed_sec": 61354.542192697525, "step_time_sec": 8.23079612199217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7398, "loss": 4.149947166442871, "lr": 0.0002, "elapsed_sec": 61362.77285027504, "step_time_sec": 8.230502146994695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7399, "loss": 4.134568214416504, "lr": 0.0002, "elapsed_sec": 61371.00456523895, "step_time_sec": 8.231601781997597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7400, "loss": 4.06630277633667, "lr": 0.0002, "elapsed_sec": 61379.234922647476, "step_time_sec": 8.230174311000155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7401, "loss": 4.024742603302002, "lr": 0.0002, "elapsed_sec": 61387.465973854065, "step_time_sec": 8.230869313003495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7402, "loss": 4.142577171325684, "lr": 0.0002, "elapsed_sec": 61395.695476293564, "step_time_sec": 8.229328913002973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7403, "loss": 4.0484538078308105, "lr": 0.0002, "elapsed_sec": 61403.92457461357, "step_time_sec": 8.2289250989852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7404, "loss": 4.266550064086914, "lr": 0.0002, "elapsed_sec": 61412.15258169174, "step_time_sec": 8.227877950994298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7405, "loss": 4.107269287109375, "lr": 0.0002, "elapsed_sec": 61420.3829665184, "step_time_sec": 8.230216511990875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7406, "loss": 4.114161491394043, "lr": 0.0002, "elapsed_sec": 61428.61440563202, "step_time_sec": 8.231292414013296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7407, "loss": 4.252710819244385, "lr": 0.0002, "elapsed_sec": 61436.845143556595, "step_time_sec": 8.230619924986968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7408, "loss": 4.166845798492432, "lr": 0.0002, "elapsed_sec": 61445.07623386383, "step_time_sec": 8.23089253398939, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7409, "loss": 4.55488395690918, "lr": 0.0002, "elapsed_sec": 61453.306275606155, "step_time_sec": 8.229878120007925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7410, "loss": 4.0947346687316895, "lr": 0.0002, "elapsed_sec": 61461.5367295742, "step_time_sec": 8.230300969997188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7411, "loss": 4.14872407913208, "lr": 0.0002, "elapsed_sec": 61469.76776432991, "step_time_sec": 8.230888448015321, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7412, "loss": 4.095692157745361, "lr": 0.0002, "elapsed_sec": 61477.99738550186, "step_time_sec": 8.229469534009695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7413, "loss": 4.088355541229248, "lr": 0.0002, "elapsed_sec": 61486.228669166565, "step_time_sec": 8.231111411994789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7414, "loss": 4.088093280792236, "lr": 0.0002, "elapsed_sec": 61494.45973229408, "step_time_sec": 8.23094474102254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7415, "loss": 4.112115383148193, "lr": 0.0002, "elapsed_sec": 61502.688118457794, "step_time_sec": 8.22820622700965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7416, "loss": 4.245822906494141, "lr": 0.0002, "elapsed_sec": 61510.91581058502, "step_time_sec": 8.22755715000676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7417, "loss": 4.078090190887451, "lr": 0.0002, "elapsed_sec": 61519.14507603645, "step_time_sec": 8.229131454019807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7418, "loss": 4.202305316925049, "lr": 0.0002, "elapsed_sec": 61527.37631177902, "step_time_sec": 8.23104960899218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7419, "loss": 4.130024433135986, "lr": 0.0002, "elapsed_sec": 61535.60688686371, "step_time_sec": 8.230423522996716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7420, "loss": 4.351149082183838, "lr": 0.0002, "elapsed_sec": 61543.83476424217, "step_time_sec": 8.227743408002425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7421, "loss": 4.006217002868652, "lr": 0.0002, "elapsed_sec": 61552.06298947334, "step_time_sec": 8.228085244016256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7422, "loss": 4.26299524307251, "lr": 0.0002, "elapsed_sec": 61560.2908885479, "step_time_sec": 8.227700610994361, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7423, "loss": 4.061769485473633, "lr": 0.0002, "elapsed_sec": 61568.52251935005, "step_time_sec": 8.231477455992717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7424, "loss": 4.239993572235107, "lr": 0.0002, "elapsed_sec": 61576.75301551819, "step_time_sec": 8.230403296998702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7425, "loss": 4.102124214172363, "lr": 0.0002, "elapsed_sec": 61584.983536720276, "step_time_sec": 8.230345188989304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7426, "loss": 4.124425888061523, "lr": 0.0002, "elapsed_sec": 61593.21187496185, "step_time_sec": 8.228137003985466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7427, "loss": 4.209721088409424, "lr": 0.0002, "elapsed_sec": 61601.44138622284, "step_time_sec": 8.229409182997188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7428, "loss": 3.973285675048828, "lr": 0.0002, "elapsed_sec": 61609.67315912247, "step_time_sec": 8.231580776977353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7429, "loss": 4.271390914916992, "lr": 0.0002, "elapsed_sec": 61617.90440988541, "step_time_sec": 8.23110062800697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7430, "loss": 4.060216426849365, "lr": 0.0002, "elapsed_sec": 61626.13384461403, "step_time_sec": 8.229332400020212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7431, "loss": 4.179496765136719, "lr": 0.0002, "elapsed_sec": 61634.36301660538, "step_time_sec": 8.228989272989566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7432, "loss": 4.029781341552734, "lr": 0.0002, "elapsed_sec": 61642.59324550629, "step_time_sec": 8.230001095012994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7433, "loss": 4.052314758300781, "lr": 0.0002, "elapsed_sec": 61650.82404589653, "step_time_sec": 8.2306756869948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7434, "loss": 4.233449459075928, "lr": 0.0002, "elapsed_sec": 61659.054601192474, "step_time_sec": 8.23039577199961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7435, "loss": 4.011130332946777, "lr": 0.0002, "elapsed_sec": 61667.285064935684, "step_time_sec": 8.230317012988962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7436, "loss": 4.196935176849365, "lr": 0.0002, "elapsed_sec": 61675.51692485809, "step_time_sec": 8.231693515990628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7437, "loss": 4.143019676208496, "lr": 0.0002, "elapsed_sec": 61683.747984170914, "step_time_sec": 8.230920964997495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7438, "loss": 4.301884651184082, "lr": 0.0002, "elapsed_sec": 61691.97852897644, "step_time_sec": 8.230430533003528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7439, "loss": 4.164508819580078, "lr": 0.0002, "elapsed_sec": 61700.20972061157, "step_time_sec": 8.231018480000785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7440, "loss": 4.207363128662109, "lr": 0.0002, "elapsed_sec": 61708.440205812454, "step_time_sec": 8.23030481301248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7441, "loss": 4.207614898681641, "lr": 0.0002, "elapsed_sec": 61716.6713745594, "step_time_sec": 8.23099982997519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7442, "loss": 4.069664478302002, "lr": 0.0002, "elapsed_sec": 61724.90221643448, "step_time_sec": 8.23071601902484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7443, "loss": 4.165800094604492, "lr": 0.0002, "elapsed_sec": 61733.13322329521, "step_time_sec": 8.230840256990632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7444, "loss": 4.180512428283691, "lr": 0.0002, "elapsed_sec": 61741.36453270912, "step_time_sec": 8.231141373020364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7445, "loss": 4.18363094329834, "lr": 0.0002, "elapsed_sec": 61749.59603309631, "step_time_sec": 8.231392122979742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7446, "loss": 4.0852227210998535, "lr": 0.0002, "elapsed_sec": 61757.827146053314, "step_time_sec": 8.230912103987066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7447, "loss": 4.190479278564453, "lr": 0.0002, "elapsed_sec": 61766.05786418915, "step_time_sec": 8.23058275799849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7448, "loss": 4.212684154510498, "lr": 0.0002, "elapsed_sec": 61774.28853869438, "step_time_sec": 8.230585937999422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7449, "loss": 4.1528520584106445, "lr": 0.0002, "elapsed_sec": 61782.52019357681, "step_time_sec": 8.231432591972407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7450, "loss": 4.001092433929443, "lr": 0.0002, "elapsed_sec": 61790.75133728981, "step_time_sec": 8.231059340992942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7451, "loss": 4.073563575744629, "lr": 0.0002, "elapsed_sec": 61798.9823782444, "step_time_sec": 8.230793900991557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7452, "loss": 4.12124490737915, "lr": 0.0002, "elapsed_sec": 61807.212873220444, "step_time_sec": 8.230382317997282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7453, "loss": 3.985982894897461, "lr": 0.0002, "elapsed_sec": 61815.44337081909, "step_time_sec": 8.230365343973972, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7454, "loss": 4.067300796508789, "lr": 0.0002, "elapsed_sec": 61823.67324781418, "step_time_sec": 8.229633203998674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7455, "loss": 4.200835227966309, "lr": 0.0002, "elapsed_sec": 61831.90237855911, "step_time_sec": 8.229005588014843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7456, "loss": 4.404501438140869, "lr": 0.0002, "elapsed_sec": 61840.13354730606, "step_time_sec": 8.231004207016667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7457, "loss": 4.1407670974731445, "lr": 0.0002, "elapsed_sec": 61848.36485719681, "step_time_sec": 8.231158405018505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7458, "loss": 4.384108543395996, "lr": 0.0002, "elapsed_sec": 61856.59668803215, "step_time_sec": 8.231643803010229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7459, "loss": 4.212984085083008, "lr": 0.0002, "elapsed_sec": 61864.82725906372, "step_time_sec": 8.230419067986077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7460, "loss": 4.216557502746582, "lr": 0.0002, "elapsed_sec": 61873.057970047, "step_time_sec": 8.230564608005807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7461, "loss": 4.0787272453308105, "lr": 0.0002, "elapsed_sec": 61881.288873672485, "step_time_sec": 8.23077171901241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7462, "loss": 4.1293768882751465, "lr": 0.0002, "elapsed_sec": 61889.5196518898, "step_time_sec": 8.230640509020304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7463, "loss": 4.079837322235107, "lr": 0.0002, "elapsed_sec": 61897.75080561638, "step_time_sec": 8.230941182991955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7464, "loss": 4.06415319442749, "lr": 0.0002, "elapsed_sec": 61905.98063111305, "step_time_sec": 8.229674189991783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7465, "loss": 4.178928375244141, "lr": 0.0002, "elapsed_sec": 61914.21240520477, "step_time_sec": 8.23162593098823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7466, "loss": 4.117701053619385, "lr": 0.0002, "elapsed_sec": 61922.508439302444, "step_time_sec": 8.238086223020218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7467, "loss": 4.152710914611816, "lr": 0.0002, "elapsed_sec": 61930.73958873749, "step_time_sec": 8.23093335199519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7468, "loss": 4.146187782287598, "lr": 0.0002, "elapsed_sec": 61938.970417022705, "step_time_sec": 8.230664250004338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7469, "loss": 4.151744365692139, "lr": 0.0002, "elapsed_sec": 61947.20176410675, "step_time_sec": 8.231233708007494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7470, "loss": 4.114970684051514, "lr": 0.0002, "elapsed_sec": 61955.43218708038, "step_time_sec": 8.230218837998109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7471, "loss": 4.196475028991699, "lr": 0.0002, "elapsed_sec": 61963.66288661957, "step_time_sec": 8.230533660011133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7472, "loss": 4.009479522705078, "lr": 0.0002, "elapsed_sec": 61971.89475893974, "step_time_sec": 8.23169854900334, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7473, "loss": 4.431599140167236, "lr": 0.0002, "elapsed_sec": 61980.12580418587, "step_time_sec": 8.230922435002867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7474, "loss": 4.298669815063477, "lr": 0.0002, "elapsed_sec": 61988.35614180565, "step_time_sec": 8.230223770980956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7475, "loss": 4.297083377838135, "lr": 0.0002, "elapsed_sec": 61996.58753037453, "step_time_sec": 8.231180403003236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7476, "loss": 4.135393142700195, "lr": 0.0002, "elapsed_sec": 62004.81863427162, "step_time_sec": 8.230955728009576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7477, "loss": 4.144632339477539, "lr": 0.0002, "elapsed_sec": 62013.04773426056, "step_time_sec": 8.228902238013688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7478, "loss": 4.227505207061768, "lr": 0.0002, "elapsed_sec": 62021.279129981995, "step_time_sec": 8.231264796981122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7479, "loss": 4.104368686676025, "lr": 0.0002, "elapsed_sec": 62029.50994873047, "step_time_sec": 8.230707419017563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7480, "loss": 4.249945163726807, "lr": 0.0002, "elapsed_sec": 62037.73944449425, "step_time_sec": 8.229239810985746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7481, "loss": 4.282962322235107, "lr": 0.0002, "elapsed_sec": 62045.96788787842, "step_time_sec": 8.228262018994428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7482, "loss": 4.280318737030029, "lr": 0.0002, "elapsed_sec": 62054.19496059418, "step_time_sec": 8.226969782001106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7483, "loss": 4.272806644439697, "lr": 0.0002, "elapsed_sec": 62062.4265935421, "step_time_sec": 8.231479943002341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7484, "loss": 4.305508136749268, "lr": 0.0002, "elapsed_sec": 62070.657428741455, "step_time_sec": 8.23062436300097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7485, "loss": 4.450327396392822, "lr": 0.0002, "elapsed_sec": 62078.8885910511, "step_time_sec": 8.23100287700072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7486, "loss": 4.286085605621338, "lr": 0.0002, "elapsed_sec": 62087.11909914017, "step_time_sec": 8.23039180401247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7487, "loss": 4.318647861480713, "lr": 0.0002, "elapsed_sec": 62095.35016775131, "step_time_sec": 8.230859168979805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7488, "loss": 4.270456314086914, "lr": 0.0002, "elapsed_sec": 62103.58094787598, "step_time_sec": 8.23062017999473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7489, "loss": 4.293475151062012, "lr": 0.0002, "elapsed_sec": 62111.81188249588, "step_time_sec": 8.23074190801708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7490, "loss": 4.232514381408691, "lr": 0.0002, "elapsed_sec": 62120.043637514114, "step_time_sec": 8.231622442981461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7491, "loss": 4.327127933502197, "lr": 0.0002, "elapsed_sec": 62128.2740778923, "step_time_sec": 8.230259709001984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7492, "loss": 4.2748613357543945, "lr": 0.0002, "elapsed_sec": 62136.50582265854, "step_time_sec": 8.231577692000428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7493, "loss": 4.246073246002197, "lr": 0.0002, "elapsed_sec": 62144.73560667038, "step_time_sec": 8.229663467995124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7494, "loss": 4.244293689727783, "lr": 0.0002, "elapsed_sec": 62152.96434497833, "step_time_sec": 8.228573876986047, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7495, "loss": 4.112903118133545, "lr": 0.0002, "elapsed_sec": 62161.19242286682, "step_time_sec": 8.227881214988884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7496, "loss": 4.276957988739014, "lr": 0.0002, "elapsed_sec": 62169.42298531532, "step_time_sec": 8.230393988982541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7497, "loss": 4.225636959075928, "lr": 0.0002, "elapsed_sec": 62177.65464949608, "step_time_sec": 8.231518574000802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7498, "loss": 4.185538291931152, "lr": 0.0002, "elapsed_sec": 62185.883101940155, "step_time_sec": 8.228269001003355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7499, "loss": 4.45969295501709, "lr": 0.0002, "elapsed_sec": 62194.112511873245, "step_time_sec": 8.229262785986066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7500, "loss": 4.170921325683594, "lr": 0.0002, "elapsed_sec": 62202.33981060982, "step_time_sec": 30.708162619004725, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7501, "loss": 4.231043815612793, "lr": 0.0002, "elapsed_sec": 62233.056539297104, "step_time_sec": 8.235177900001872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7502, "loss": 4.174661159515381, "lr": 0.00019999992493097322, "elapsed_sec": 62241.27357888222, "step_time_sec": 8.216865369991865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7503, "loss": 4.117559432983398, "lr": 0.00019999969972401153, "elapsed_sec": 62249.49077272415, "step_time_sec": 8.217035887995735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7504, "loss": 4.200078964233398, "lr": 0.00019999932437947084, "elapsed_sec": 62257.717832803726, "step_time_sec": 8.226978279009927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7505, "loss": 4.363711833953857, "lr": 0.00019999879889794433, "elapsed_sec": 62265.94565701485, "step_time_sec": 8.227603008010192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7506, "loss": 4.310970306396484, "lr": 0.00019999812328026244, "elapsed_sec": 62274.17851448059, "step_time_sec": 8.232692756020697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7507, "loss": 4.2701616287231445, "lr": 0.00019999729752749303, "elapsed_sec": 62282.40923810005, "step_time_sec": 8.23057423697901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7508, "loss": 4.466797351837158, "lr": 0.00019999632164094101, "elapsed_sec": 62290.64055609703, "step_time_sec": 8.231166021025274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7509, "loss": 4.16969633102417, "lr": 0.0001999951956221487, "elapsed_sec": 62298.871376276016, "step_time_sec": 8.230670154007385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7510, "loss": 4.151364803314209, "lr": 0.0001999939194728957, "elapsed_sec": 62307.10278058052, "step_time_sec": 8.231244572001742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7511, "loss": 4.165115833282471, "lr": 0.0001999924931951988, "elapsed_sec": 62315.334159612656, "step_time_sec": 8.231192263017874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7512, "loss": 4.144097805023193, "lr": 0.00019999091679131213, "elapsed_sec": 62323.56551027298, "step_time_sec": 8.23123148799641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7513, "loss": 4.323545932769775, "lr": 0.00019998919026372695, "elapsed_sec": 62331.79728150368, "step_time_sec": 8.231579579005484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7514, "loss": 4.130186557769775, "lr": 0.00019998731361517194, "elapsed_sec": 62340.02787923813, "step_time_sec": 8.230513624992454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7515, "loss": 4.090141773223877, "lr": 0.00019998528684861294, "elapsed_sec": 62348.25915956497, "step_time_sec": 8.231078827986494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7516, "loss": 4.293458938598633, "lr": 0.00019998310996725306, "elapsed_sec": 62356.490186452866, "step_time_sec": 8.230867075006245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7517, "loss": 4.144264221191406, "lr": 0.00019998078297453264, "elapsed_sec": 62364.72159552574, "step_time_sec": 8.231233497994253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7518, "loss": 4.149736404418945, "lr": 0.00019997830587412926, "elapsed_sec": 62372.95290160179, "step_time_sec": 8.231191605009371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7519, "loss": 4.21456241607666, "lr": 0.00019997567866995774, "elapsed_sec": 62381.183911561966, "step_time_sec": 8.23081046299194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7520, "loss": 4.314664363861084, "lr": 0.00019997290136617007, "elapsed_sec": 62389.415177345276, "step_time_sec": 8.231119787000353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7521, "loss": 4.012003421783447, "lr": 0.00019996997396715558, "elapsed_sec": 62397.645466566086, "step_time_sec": 8.230119112005923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7522, "loss": 4.193385601043701, "lr": 0.00019996689647754066, "elapsed_sec": 62405.87671494484, "step_time_sec": 8.231112290988676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7523, "loss": 4.291706562042236, "lr": 0.00019996366890218898, "elapsed_sec": 62414.10752558708, "step_time_sec": 8.230688938987441, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7524, "loss": 4.301397800445557, "lr": 0.00019996029124620148, "elapsed_sec": 62422.33905506134, "step_time_sec": 8.231350259011379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7525, "loss": 4.245148658752441, "lr": 0.00019995676351491613, "elapsed_sec": 62430.57008433342, "step_time_sec": 8.230873105989303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7526, "loss": 4.2156219482421875, "lr": 0.0001999530857139082, "elapsed_sec": 62438.8018014431, "step_time_sec": 8.231555613019736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7527, "loss": 4.191948413848877, "lr": 0.00019994925784899004, "elapsed_sec": 62447.03296661377, "step_time_sec": 8.231020356004592, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7528, "loss": 4.102085590362549, "lr": 0.0001999452799262113, "elapsed_sec": 62455.26431727409, "step_time_sec": 8.231188312987797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7529, "loss": 4.161773681640625, "lr": 0.0001999411519518586, "elapsed_sec": 62463.49566698074, "step_time_sec": 8.231248522992246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7530, "loss": 4.107585430145264, "lr": 0.0001999368739324558, "elapsed_sec": 62471.72630786896, "step_time_sec": 8.230421059997752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7531, "loss": 4.044937610626221, "lr": 0.00019993244587476397, "elapsed_sec": 62479.95683288574, "step_time_sec": 8.230438664992107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7532, "loss": 4.047877311706543, "lr": 0.0001999278677857811, "elapsed_sec": 62488.18618559837, "step_time_sec": 8.229190601006849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7533, "loss": 4.18499231338501, "lr": 0.00019992313967274255, "elapsed_sec": 62496.41690635681, "step_time_sec": 8.230494490999263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7534, "loss": 4.189175605773926, "lr": 0.00019991826154312046, "elapsed_sec": 62504.64535450935, "step_time_sec": 8.22833559900755, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7535, "loss": 4.122617721557617, "lr": 0.00019991323340462434, "elapsed_sec": 62512.87606573105, "step_time_sec": 8.230576727015432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7536, "loss": 4.19984245300293, "lr": 0.00019990805526520064, "elapsed_sec": 62521.10745692253, "step_time_sec": 8.231203431991162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7537, "loss": 4.340680122375488, "lr": 0.0001999027271330329, "elapsed_sec": 62529.33565735817, "step_time_sec": 8.2280296679819, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7538, "loss": 4.286325931549072, "lr": 0.0001998972490165417, "elapsed_sec": 62537.5630710125, "step_time_sec": 8.227240823005559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7539, "loss": 4.231744766235352, "lr": 0.0001998916209243846, "elapsed_sec": 62545.79240489006, "step_time_sec": 8.229193773004226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7540, "loss": 4.1974101066589355, "lr": 0.00019988584286545635, "elapsed_sec": 62554.02298426628, "step_time_sec": 8.230449442984536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7541, "loss": 4.099454402923584, "lr": 0.00019987991484888851, "elapsed_sec": 62562.25329685211, "step_time_sec": 8.230142902000807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7542, "loss": 4.222587585449219, "lr": 0.00019987383688404973, "elapsed_sec": 62570.48157572746, "step_time_sec": 8.228104981011711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7543, "loss": 4.262479782104492, "lr": 0.0001998676089805457, "elapsed_sec": 62578.712423324585, "step_time_sec": 8.230714418983553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7544, "loss": 4.168695449829102, "lr": 0.00019986123114821894, "elapsed_sec": 62586.94155073166, "step_time_sec": 8.228942223999184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7545, "loss": 4.256417274475098, "lr": 0.00019985470339714897, "elapsed_sec": 62595.171424388885, "step_time_sec": 8.22978470500675, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7546, "loss": 4.261117935180664, "lr": 0.0001998480257376523, "elapsed_sec": 62603.40047669411, "step_time_sec": 8.228900576010346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7547, "loss": 4.079694747924805, "lr": 0.00019984119818028227, "elapsed_sec": 62611.6291885376, "step_time_sec": 8.228509615000803, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7548, "loss": 4.158231735229492, "lr": 0.00019983422073582914, "elapsed_sec": 62619.85845088959, "step_time_sec": 8.229103818972362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7549, "loss": 4.252469539642334, "lr": 0.00019982709341532013, "elapsed_sec": 62628.089698791504, "step_time_sec": 8.231110249995254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7550, "loss": 4.369049549102783, "lr": 0.0001998198162300192, "elapsed_sec": 62636.318537950516, "step_time_sec": 8.228617531014606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7551, "loss": 4.263192176818848, "lr": 0.00019981238919142725, "elapsed_sec": 62644.54762554169, "step_time_sec": 8.228914136008825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7552, "loss": 4.234191417694092, "lr": 0.00019980481231128197, "elapsed_sec": 62652.77678847313, "step_time_sec": 8.229008069989504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7553, "loss": 4.206872940063477, "lr": 0.00019979708560155784, "elapsed_sec": 62661.00789928436, "step_time_sec": 8.230979853979079, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7554, "loss": 4.277383327484131, "lr": 0.0001997892090744662, "elapsed_sec": 62669.23758459091, "step_time_sec": 8.229530979006086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7555, "loss": 4.369133472442627, "lr": 0.0001997811827424551, "elapsed_sec": 62677.467933654785, "step_time_sec": 8.230196027987404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7556, "loss": 4.120972633361816, "lr": 0.00019977300661820936, "elapsed_sec": 62685.69895863533, "step_time_sec": 8.230886208999436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7557, "loss": 4.261970043182373, "lr": 0.00019976468071465053, "elapsed_sec": 62693.92785382271, "step_time_sec": 8.22874176298501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7558, "loss": 4.120639801025391, "lr": 0.00019975620504493686, "elapsed_sec": 62702.158742427826, "step_time_sec": 8.230692975019338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7559, "loss": 4.260241508483887, "lr": 0.00019974757962246332, "elapsed_sec": 62710.389372348785, "step_time_sec": 8.23049971400178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7560, "loss": 4.202147960662842, "lr": 0.0001997388044608615, "elapsed_sec": 62718.61983013153, "step_time_sec": 8.230306409008335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7561, "loss": 4.205192565917969, "lr": 0.00019972987957399973, "elapsed_sec": 62726.85138082504, "step_time_sec": 8.231355530006113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7562, "loss": 4.2638750076293945, "lr": 0.0001997208049759829, "elapsed_sec": 62735.08222055435, "step_time_sec": 8.230741879000561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7563, "loss": 4.293694496154785, "lr": 0.00019971158068115245, "elapsed_sec": 62743.314059734344, "step_time_sec": 8.231686571991304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7564, "loss": 4.126428127288818, "lr": 0.00019970220670408647, "elapsed_sec": 62751.54454636574, "step_time_sec": 8.230294634995516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7565, "loss": 4.122790813446045, "lr": 0.00019969268305959967, "elapsed_sec": 62759.7736287117, "step_time_sec": 8.228927059011767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7566, "loss": 4.180386543273926, "lr": 0.00019968300976274314, "elapsed_sec": 62768.00429606438, "step_time_sec": 8.230502268997952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7567, "loss": 4.234138011932373, "lr": 0.0001996731868288046, "elapsed_sec": 62776.23590230942, "step_time_sec": 8.231449801998679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7568, "loss": 4.216074466705322, "lr": 0.00019966321427330823, "elapsed_sec": 62784.467443943024, "step_time_sec": 8.231392329005757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7569, "loss": 4.213047027587891, "lr": 0.00019965309211201465, "elapsed_sec": 62792.69856309891, "step_time_sec": 8.230938845983474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7570, "loss": 4.3676605224609375, "lr": 0.00019964282036092097, "elapsed_sec": 62800.92984008789, "step_time_sec": 8.23112331298762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7571, "loss": 4.2021355628967285, "lr": 0.0001996323990362606, "elapsed_sec": 62809.16151928902, "step_time_sec": 8.23152986500645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7572, "loss": 4.275321006774902, "lr": 0.00019962182815450342, "elapsed_sec": 62817.39298248291, "step_time_sec": 8.231289946998004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7573, "loss": 4.299767017364502, "lr": 0.0001996111077323557, "elapsed_sec": 62825.62403488159, "step_time_sec": 8.230951675010147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7574, "loss": 4.088315486907959, "lr": 0.00019960023778675998, "elapsed_sec": 62833.849486112595, "step_time_sec": 8.225269807007862, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7575, "loss": 4.159048080444336, "lr": 0.0001995892183348951, "elapsed_sec": 62842.07746434212, "step_time_sec": 8.22774975100765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7576, "loss": 4.3597564697265625, "lr": 0.00019957804939417628, "elapsed_sec": 62850.30743789673, "step_time_sec": 8.229862749984022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7577, "loss": 4.225555419921875, "lr": 0.00019956673098225488, "elapsed_sec": 62858.53849339485, "step_time_sec": 8.230917713983217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7578, "loss": 4.104135990142822, "lr": 0.00019955526311701848, "elapsed_sec": 62866.769011735916, "step_time_sec": 8.230348345998209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7579, "loss": 4.235966205596924, "lr": 0.00019954364581659096, "elapsed_sec": 62875.00040960312, "step_time_sec": 8.231191937986296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7580, "loss": 4.02359676361084, "lr": 0.00019953187909933228, "elapsed_sec": 62883.23120880127, "step_time_sec": 8.230659494001884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7581, "loss": 4.3140177726745605, "lr": 0.00019951996298383858, "elapsed_sec": 62891.46198153496, "step_time_sec": 8.230602293013362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7582, "loss": 4.082923412322998, "lr": 0.00019950789748894208, "elapsed_sec": 62899.6938700676, "step_time_sec": 8.231735521025257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7583, "loss": 4.157198429107666, "lr": 0.00019949568263371114, "elapsed_sec": 62907.92415857315, "step_time_sec": 8.23008689598646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7584, "loss": 4.207521915435791, "lr": 0.00019948331843745005, "elapsed_sec": 62916.15560936928, "step_time_sec": 8.231383766018553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7585, "loss": 4.160419940948486, "lr": 0.00019947080491969927, "elapsed_sec": 62924.386912345886, "step_time_sec": 8.231107506988337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7586, "loss": 4.139006614685059, "lr": 0.0001994581421002351, "elapsed_sec": 62932.62028813362, "step_time_sec": 8.233175285015022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7587, "loss": 4.1602983474731445, "lr": 0.0001994453299990699, "elapsed_sec": 62940.85136413574, "step_time_sec": 8.230935386993224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7588, "loss": 4.16922664642334, "lr": 0.00019943236863645197, "elapsed_sec": 62949.08248257637, "step_time_sec": 8.231002274987986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7589, "loss": 4.143311500549316, "lr": 0.00019941925803286536, "elapsed_sec": 62957.313570261, "step_time_sec": 8.230915840977104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7590, "loss": 4.025609493255615, "lr": 0.00019940599820903013, "elapsed_sec": 62965.544904470444, "step_time_sec": 8.231167212012224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7591, "loss": 4.14874267578125, "lr": 0.0001993925891859021, "elapsed_sec": 62973.776334285736, "step_time_sec": 8.23133013700135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7592, "loss": 4.1204400062561035, "lr": 0.0001993790309846729, "elapsed_sec": 62982.00756859779, "step_time_sec": 8.230987784016179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7593, "loss": 4.250630855560303, "lr": 0.00019936532362676995, "elapsed_sec": 62990.23877310753, "step_time_sec": 8.231123398989439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7594, "loss": 4.001655101776123, "lr": 0.00019935146713385635, "elapsed_sec": 62998.469636917114, "step_time_sec": 8.230721840023762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7595, "loss": 3.9817306995391846, "lr": 0.00019933746152783085, "elapsed_sec": 63006.70072865486, "step_time_sec": 8.230887395009631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7596, "loss": 4.044632434844971, "lr": 0.000199323306830828, "elapsed_sec": 63014.932602882385, "step_time_sec": 8.231717301998287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7597, "loss": 4.220474720001221, "lr": 0.00019930900306521784, "elapsed_sec": 63023.16417264938, "step_time_sec": 8.231404382007895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7598, "loss": 4.143275737762451, "lr": 0.00019929455025360606, "elapsed_sec": 63031.39507699013, "step_time_sec": 8.230771442991681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7599, "loss": 4.167200565338135, "lr": 0.0001992799484188339, "elapsed_sec": 63039.62646603584, "step_time_sec": 8.2312726030068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7600, "loss": 4.142934799194336, "lr": 0.0001992651975839781, "elapsed_sec": 63047.85722088814, "step_time_sec": 8.230578695976874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7601, "loss": 3.9805057048797607, "lr": 0.0001992502977723509, "elapsed_sec": 63056.08832907677, "step_time_sec": 8.230919415014796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7602, "loss": 4.118913173675537, "lr": 0.00019923524900749998, "elapsed_sec": 63064.31692314148, "step_time_sec": 8.228471066977363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7603, "loss": 4.2037739753723145, "lr": 0.00019922005131320838, "elapsed_sec": 63072.5479516983, "step_time_sec": 8.23083894699812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7604, "loss": 4.166679382324219, "lr": 0.00019920470471349456, "elapsed_sec": 63080.779009103775, "step_time_sec": 8.230898301990237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7605, "loss": 4.1349196434021, "lr": 0.0001991892092326123, "elapsed_sec": 63089.009951114655, "step_time_sec": 8.230819030024577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7606, "loss": 4.07930326461792, "lr": 0.00019917356489505068, "elapsed_sec": 63097.24032711983, "step_time_sec": 8.230201303987997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7607, "loss": 4.111403465270996, "lr": 0.00019915777172553403, "elapsed_sec": 63105.47142100334, "step_time_sec": 8.230980107007781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7608, "loss": 4.0340118408203125, "lr": 0.00019914182974902183, "elapsed_sec": 63113.70202946663, "step_time_sec": 8.230413661978673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7609, "loss": 4.126522064208984, "lr": 0.00019912573899070882, "elapsed_sec": 63121.933710336685, "step_time_sec": 8.231496356020216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7610, "loss": 4.27404260635376, "lr": 0.00019910949947602482, "elapsed_sec": 63130.16448187828, "step_time_sec": 8.230651036981726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7611, "loss": 4.20223331451416, "lr": 0.00019909311123063482, "elapsed_sec": 63138.397110939026, "step_time_sec": 8.232458622020204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7612, "loss": 4.194394588470459, "lr": 0.00019907657428043878, "elapsed_sec": 63146.62739825249, "step_time_sec": 8.230118226987543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7613, "loss": 4.182150363922119, "lr": 0.00019905988865157173, "elapsed_sec": 63154.85732746124, "step_time_sec": 8.229830347991083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7614, "loss": 4.116179466247559, "lr": 0.00019904305437040356, "elapsed_sec": 63163.08740019798, "step_time_sec": 8.22987906599883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7615, "loss": 4.125494956970215, "lr": 0.00019902607146353928, "elapsed_sec": 63171.314138650894, "step_time_sec": 8.226642248017015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7616, "loss": 4.003848552703857, "lr": 0.00019900893995781867, "elapsed_sec": 63179.54575538635, "step_time_sec": 8.231460918992525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7617, "loss": 4.0618672370910645, "lr": 0.0001989916598803163, "elapsed_sec": 63187.77614545822, "step_time_sec": 8.230172835988924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7618, "loss": 3.8085711002349854, "lr": 0.0001989742312583417, "elapsed_sec": 63196.0065946579, "step_time_sec": 8.230286281002918, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7619, "loss": 4.175075054168701, "lr": 0.00019895665411943903, "elapsed_sec": 63204.23761200905, "step_time_sec": 8.230901272007031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7620, "loss": 4.136068820953369, "lr": 0.00019893892849138722, "elapsed_sec": 63212.468026161194, "step_time_sec": 8.23023722297512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7621, "loss": 3.9994142055511475, "lr": 0.00019892105440219986, "elapsed_sec": 63220.699724435806, "step_time_sec": 8.23154784500366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7622, "loss": 4.012779712677002, "lr": 0.00019890303188012517, "elapsed_sec": 63228.93176364899, "step_time_sec": 8.23189048099448, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7623, "loss": 4.0434370040893555, "lr": 0.00019888486095364594, "elapsed_sec": 63237.16215634346, "step_time_sec": 8.230206322012236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7624, "loss": 3.992349863052368, "lr": 0.00019886654165147954, "elapsed_sec": 63245.392394542694, "step_time_sec": 8.230132488999516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7625, "loss": 4.008578300476074, "lr": 0.00019884807400257783, "elapsed_sec": 63253.62347817421, "step_time_sec": 8.230933829996502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7626, "loss": 4.121563911437988, "lr": 0.00019882945803612702, "elapsed_sec": 63261.854635715485, "step_time_sec": 8.230925395997474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7627, "loss": 4.173977851867676, "lr": 0.0001988106937815478, "elapsed_sec": 63270.08494782448, "step_time_sec": 8.23018066197983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7628, "loss": 4.212357044219971, "lr": 0.0001987917812684953, "elapsed_sec": 63278.31654047966, "step_time_sec": 8.231469816993922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7629, "loss": 4.229048728942871, "lr": 0.00019877272052685872, "elapsed_sec": 63286.54769515991, "step_time_sec": 8.230978339008288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7630, "loss": 4.210933208465576, "lr": 0.0001987535115867618, "elapsed_sec": 63294.77906727791, "step_time_sec": 8.231241772009525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7631, "loss": 4.045263767242432, "lr": 0.00019873415447856235, "elapsed_sec": 63303.00715970993, "step_time_sec": 8.227884815016296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7632, "loss": 3.9322235584259033, "lr": 0.00019871464923285227, "elapsed_sec": 63311.23723554611, "step_time_sec": 8.229942419013241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7633, "loss": 4.126222133636475, "lr": 0.0001986949958804577, "elapsed_sec": 63319.46821141243, "step_time_sec": 8.230855381989386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7634, "loss": 4.042545795440674, "lr": 0.00019867519445243887, "elapsed_sec": 63327.69759678841, "step_time_sec": 8.229188350000186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7635, "loss": 4.242661476135254, "lr": 0.00019865524498008987, "elapsed_sec": 63335.92616224289, "step_time_sec": 8.228411098010838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7636, "loss": 4.11316442489624, "lr": 0.0001986351474949389, "elapsed_sec": 63344.15677309036, "step_time_sec": 8.230460738006514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7637, "loss": 4.070343017578125, "lr": 0.00019861490202874803, "elapsed_sec": 63352.38824939728, "step_time_sec": 8.231323943007737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7638, "loss": 4.202555179595947, "lr": 0.00019859450861351327, "elapsed_sec": 63360.618933200836, "step_time_sec": 8.230551184009528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7639, "loss": 4.236643314361572, "lr": 0.00019857396728146427, "elapsed_sec": 63368.84874725342, "step_time_sec": 8.229650455003139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7640, "loss": 4.221217155456543, "lr": 0.00019855327806506465, "elapsed_sec": 63377.07986283302, "step_time_sec": 8.230940006003948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7641, "loss": 4.109718322753906, "lr": 0.00019853244099701163, "elapsed_sec": 63385.30824303627, "step_time_sec": 8.228250977001153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7642, "loss": 4.271256446838379, "lr": 0.00019851145611023613, "elapsed_sec": 63393.5394179821, "step_time_sec": 8.23099347401876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7643, "loss": 4.023376941680908, "lr": 0.0001984903234379027, "elapsed_sec": 63401.770872831345, "step_time_sec": 8.231277935003163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7644, "loss": 3.9346115589141846, "lr": 0.00019846904301340932, "elapsed_sec": 63410.00038146973, "step_time_sec": 8.229381779994583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7645, "loss": 3.970207452774048, "lr": 0.0001984476148703877, "elapsed_sec": 63418.22933959961, "step_time_sec": 8.228821044001961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7646, "loss": 4.134049415588379, "lr": 0.00019842603904270288, "elapsed_sec": 63426.456249952316, "step_time_sec": 8.226662591972854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7647, "loss": 4.1931681632995605, "lr": 0.00019840431556445326, "elapsed_sec": 63434.6878426075, "step_time_sec": 8.231486201024381, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7648, "loss": 4.216803550720215, "lr": 0.0001983824444699707, "elapsed_sec": 63442.91868329048, "step_time_sec": 8.230621577007696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7649, "loss": 4.015146255493164, "lr": 0.0001983604257938202, "elapsed_sec": 63451.14865207672, "step_time_sec": 8.229816574981669, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7650, "loss": 4.036952972412109, "lr": 0.00019833825957080018, "elapsed_sec": 63459.3779771328, "step_time_sec": 8.229178901994601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7651, "loss": 4.000943183898926, "lr": 0.00019831594583594208, "elapsed_sec": 63467.60722947121, "step_time_sec": 8.229073661001166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7652, "loss": 4.114128589630127, "lr": 0.00019829348462451057, "elapsed_sec": 63475.83867549896, "step_time_sec": 8.231345809006598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7653, "loss": 4.193666934967041, "lr": 0.00019827087597200336, "elapsed_sec": 63484.06897521019, "step_time_sec": 8.23007520500687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7654, "loss": 4.092533588409424, "lr": 0.00019824811991415122, "elapsed_sec": 63492.30079817772, "step_time_sec": 8.231653812981676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7655, "loss": 4.050185680389404, "lr": 0.00019822521648691775, "elapsed_sec": 63500.53165745735, "step_time_sec": 8.230729159986367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7656, "loss": 4.169108867645264, "lr": 0.0001982021657264996, "elapsed_sec": 63508.76226425171, "step_time_sec": 8.23041583300801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7657, "loss": 4.243004322052002, "lr": 0.0001981789676693262, "elapsed_sec": 63516.992288827896, "step_time_sec": 8.229874140000902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7658, "loss": 4.255459308624268, "lr": 0.00019815562235205977, "elapsed_sec": 63525.22378349304, "step_time_sec": 8.231326781999087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7659, "loss": 4.160529613494873, "lr": 0.00019813212981159526, "elapsed_sec": 63533.456288576126, "step_time_sec": 8.232408234995091, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7660, "loss": 4.208989143371582, "lr": 0.00019810849008506027, "elapsed_sec": 63541.68720173836, "step_time_sec": 8.230738574988209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7661, "loss": 4.082528591156006, "lr": 0.00019808470320981504, "elapsed_sec": 63549.91883420944, "step_time_sec": 8.23150441297912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7662, "loss": 4.135775566101074, "lr": 0.00019806076922345236, "elapsed_sec": 63558.14999938011, "step_time_sec": 8.23092808702495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7663, "loss": 4.071573734283447, "lr": 0.00019803668816379758, "elapsed_sec": 63566.381041526794, "step_time_sec": 8.230888099002186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7664, "loss": 4.1613078117370605, "lr": 0.00019801246006890836, "elapsed_sec": 63574.610843896866, "step_time_sec": 8.229659197008004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7665, "loss": 4.162325859069824, "lr": 0.00019798808497707482, "elapsed_sec": 63582.84051680565, "step_time_sec": 8.22952372499276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7666, "loss": 4.235531330108643, "lr": 0.00019796356292681934, "elapsed_sec": 63591.071011304855, "step_time_sec": 8.230363507987931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7667, "loss": 4.20902681350708, "lr": 0.0001979388939568966, "elapsed_sec": 63599.29701948166, "step_time_sec": 8.225788601004751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7668, "loss": 4.175782680511475, "lr": 0.00019791407810629345, "elapsed_sec": 63607.528280973434, "step_time_sec": 8.231115164002404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7669, "loss": 4.012176990509033, "lr": 0.00019788911541422888, "elapsed_sec": 63615.75849223137, "step_time_sec": 8.230107337003574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7670, "loss": 4.182000160217285, "lr": 0.00019786400592015395, "elapsed_sec": 63623.987787246704, "step_time_sec": 8.229063890001271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7671, "loss": 4.106530666351318, "lr": 0.0001978387496637517, "elapsed_sec": 63632.21640729904, "step_time_sec": 8.228468963992782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7672, "loss": 4.2768402099609375, "lr": 0.00019781334668493713, "elapsed_sec": 63640.4462018013, "step_time_sec": 8.229641299985815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7673, "loss": 4.09655237197876, "lr": 0.00019778779702385715, "elapsed_sec": 63648.6779024601, "step_time_sec": 8.23151985098957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7674, "loss": 4.298882961273193, "lr": 0.00019776210072089043, "elapsed_sec": 63656.908861637115, "step_time_sec": 8.230781891004881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7675, "loss": 4.086348533630371, "lr": 0.00019773625781664743, "elapsed_sec": 63665.1408765316, "step_time_sec": 8.23187544400571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7676, "loss": 4.093648910522461, "lr": 0.0001977102683519703, "elapsed_sec": 63673.371639966965, "step_time_sec": 8.23058265799773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7677, "loss": 3.9744772911071777, "lr": 0.00019768413236793278, "elapsed_sec": 63681.60279536247, "step_time_sec": 8.23098753899103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7678, "loss": 4.161222457885742, "lr": 0.0001976578499058402, "elapsed_sec": 63689.832639455795, "step_time_sec": 8.229740044014761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7679, "loss": 4.149501323699951, "lr": 0.00019763142100722944, "elapsed_sec": 63698.06230568886, "step_time_sec": 8.229451858991524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7680, "loss": 4.139462947845459, "lr": 0.0001976048457138687, "elapsed_sec": 63706.29402375221, "step_time_sec": 8.231594443990616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7681, "loss": 4.122117042541504, "lr": 0.00019757812406775754, "elapsed_sec": 63714.52456498146, "step_time_sec": 8.230312193976715, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7682, "loss": 4.155200481414795, "lr": 0.00019755125611112693, "elapsed_sec": 63722.75521278381, "step_time_sec": 8.230475076008588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7683, "loss": 4.0239338874816895, "lr": 0.00019752424188643897, "elapsed_sec": 63730.9862306118, "step_time_sec": 8.230986387003213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7684, "loss": 4.107445240020752, "lr": 0.00019749708143638697, "elapsed_sec": 63739.21722817421, "step_time_sec": 8.230724082997767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7685, "loss": 4.266946792602539, "lr": 0.0001974697748038953, "elapsed_sec": 63747.44789958, "step_time_sec": 8.2305100520025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7686, "loss": 4.256223201751709, "lr": 0.0001974423220321194, "elapsed_sec": 63755.678210020065, "step_time_sec": 8.230156815989176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7687, "loss": 4.1697001457214355, "lr": 0.0001974147231644457, "elapsed_sec": 63763.909851551056, "step_time_sec": 8.231493431987474, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7688, "loss": 4.1479573249816895, "lr": 0.00019738697824449135, "elapsed_sec": 63772.14032626152, "step_time_sec": 8.230379307991825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7689, "loss": 4.058629035949707, "lr": 0.00019735908731610452, "elapsed_sec": 63780.37156844139, "step_time_sec": 8.231036223995034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7690, "loss": 4.1743292808532715, "lr": 0.00019733105042336402, "elapsed_sec": 63788.602371931076, "step_time_sec": 8.230675052007427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7691, "loss": 4.092673301696777, "lr": 0.00019730286761057938, "elapsed_sec": 63796.83268761635, "step_time_sec": 8.230119320011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7692, "loss": 4.090548515319824, "lr": 0.00019727453892229071, "elapsed_sec": 63805.063747406006, "step_time_sec": 8.23090865201084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7693, "loss": 4.090573310852051, "lr": 0.00019724606440326876, "elapsed_sec": 63813.29426908493, "step_time_sec": 8.230317595996894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7694, "loss": 4.154499530792236, "lr": 0.00019721744409851455, "elapsed_sec": 63821.525596380234, "step_time_sec": 8.231224053015467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7695, "loss": 4.155160427093506, "lr": 0.00019718867805325972, "elapsed_sec": 63829.75639009476, "step_time_sec": 8.230592601001263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7696, "loss": 4.186562538146973, "lr": 0.00019715976631296615, "elapsed_sec": 63837.98802137375, "step_time_sec": 8.231459222995909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7697, "loss": 4.391905784606934, "lr": 0.00019713070892332592, "elapsed_sec": 63846.21877741814, "step_time_sec": 8.230637531989487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7698, "loss": 4.171647071838379, "lr": 0.00019710150593026137, "elapsed_sec": 63854.44908595085, "step_time_sec": 8.230126437003491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7699, "loss": 4.1966376304626465, "lr": 0.0001970721573799249, "elapsed_sec": 63862.6796605587, "step_time_sec": 8.230395006015897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7700, "loss": 4.203304767608643, "lr": 0.000197042663318699, "elapsed_sec": 63870.90969824791, "step_time_sec": 8.229941529978532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7701, "loss": 4.125126838684082, "lr": 0.00019701302379319612, "elapsed_sec": 63879.139614105225, "step_time_sec": 8.229716724978061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7702, "loss": 4.233382701873779, "lr": 0.00019698323885025855, "elapsed_sec": 63887.37063860893, "step_time_sec": 8.230887610989157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7703, "loss": 4.12429141998291, "lr": 0.00019695330853695842, "elapsed_sec": 63895.602685689926, "step_time_sec": 8.231868412985932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7704, "loss": 3.9560577869415283, "lr": 0.00019692323290059768, "elapsed_sec": 63903.833389520645, "step_time_sec": 8.230577311012894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7705, "loss": 4.204691410064697, "lr": 0.0001968930119887078, "elapsed_sec": 63912.06457090378, "step_time_sec": 8.231074081995757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7706, "loss": 4.165716648101807, "lr": 0.00019686264584904998, "elapsed_sec": 63920.29587697983, "step_time_sec": 8.23114357801387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7707, "loss": 4.135679721832275, "lr": 0.00019683213452961485, "elapsed_sec": 63928.52524971962, "step_time_sec": 8.229178855021019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7708, "loss": 4.076425075531006, "lr": 0.00019680147807862256, "elapsed_sec": 63936.75386738777, "step_time_sec": 8.228485233994434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7709, "loss": 4.215339660644531, "lr": 0.00019677067654452258, "elapsed_sec": 63944.98426914215, "step_time_sec": 8.23021575799794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7710, "loss": 4.197433948516846, "lr": 0.0001967397299759936, "elapsed_sec": 63953.21610951424, "step_time_sec": 8.231671975023346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7711, "loss": 4.138127326965332, "lr": 0.00019670863842194368, "elapsed_sec": 63961.44647812843, "step_time_sec": 8.23023000502144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7712, "loss": 4.005358695983887, "lr": 0.00019667740193150989, "elapsed_sec": 63969.677142858505, "step_time_sec": 8.230523935024394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7713, "loss": 4.133321762084961, "lr": 0.0001966460205540584, "elapsed_sec": 63977.90827035904, "step_time_sec": 8.230949375982163, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7714, "loss": 4.102063179016113, "lr": 0.00019661449433918432, "elapsed_sec": 63986.13874197006, "step_time_sec": 8.230340304988204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7715, "loss": 4.136806488037109, "lr": 0.00019658282333671177, "elapsed_sec": 63994.36906123161, "step_time_sec": 8.230159669008572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7716, "loss": 4.213565826416016, "lr": 0.0001965510075966936, "elapsed_sec": 64002.599816560745, "step_time_sec": 8.230541459983215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7717, "loss": 4.104772567749023, "lr": 0.0001965190471694114, "elapsed_sec": 64010.83060193062, "step_time_sec": 8.23062277500867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7718, "loss": 4.196958541870117, "lr": 0.00019648694210537543, "elapsed_sec": 64019.0611846447, "step_time_sec": 8.230459946993506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7719, "loss": 4.211127281188965, "lr": 0.0001964546924553246, "elapsed_sec": 64027.29280304909, "step_time_sec": 8.231451105006272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7720, "loss": 4.0764479637146, "lr": 0.00019642229827022622, "elapsed_sec": 64035.52453136444, "step_time_sec": 8.231609986978583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7721, "loss": 4.116068363189697, "lr": 0.00019638975960127613, "elapsed_sec": 64043.754529953, "step_time_sec": 8.229826494993176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7722, "loss": 4.232635021209717, "lr": 0.00019635707649989846, "elapsed_sec": 64051.9862947464, "step_time_sec": 8.231539811997209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7723, "loss": 4.2934393882751465, "lr": 0.00019632424901774558, "elapsed_sec": 64060.217361450195, "step_time_sec": 8.230920458998298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7724, "loss": 4.151172161102295, "lr": 0.00019629127720669805, "elapsed_sec": 64068.44929242134, "step_time_sec": 8.231802611000603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7725, "loss": 4.193305015563965, "lr": 0.00019625816111886455, "elapsed_sec": 64076.67993760109, "step_time_sec": 8.230418013990857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7726, "loss": 4.218358039855957, "lr": 0.00019622490080658175, "elapsed_sec": 64084.91072559357, "step_time_sec": 8.230653018021258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7727, "loss": 4.244167327880859, "lr": 0.0001961914963224143, "elapsed_sec": 64093.14225959778, "step_time_sec": 8.231366201012861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7728, "loss": 4.304080486297607, "lr": 0.0001961579477191547, "elapsed_sec": 64101.37205505371, "step_time_sec": 8.229691079992335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7729, "loss": 4.202428340911865, "lr": 0.00019612425504982308, "elapsed_sec": 64109.601726055145, "step_time_sec": 8.229474827996455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7730, "loss": 4.206306457519531, "lr": 0.00019609041836766742, "elapsed_sec": 64117.83158636093, "step_time_sec": 8.229683095007204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7731, "loss": 4.0781168937683105, "lr": 0.00019605643772616319, "elapsed_sec": 64126.06253051758, "step_time_sec": 8.230829715990694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7732, "loss": 4.179805755615234, "lr": 0.00019602231317901347, "elapsed_sec": 64134.293610572815, "step_time_sec": 8.230881134019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7733, "loss": 4.170017719268799, "lr": 0.00019598804478014869, "elapsed_sec": 64142.52393889427, "step_time_sec": 8.230192440998508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7734, "loss": 4.214142799377441, "lr": 0.0001959536325837266, "elapsed_sec": 64150.75554680824, "step_time_sec": 8.231420075986534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7735, "loss": 4.09304666519165, "lr": 0.00019591907664413233, "elapsed_sec": 64158.98657345772, "step_time_sec": 8.230825111008016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7736, "loss": 3.961272716522217, "lr": 0.00019588437701597807, "elapsed_sec": 64167.21828532219, "step_time_sec": 8.23157945901039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7737, "loss": 4.227740287780762, "lr": 0.00019584953375410315, "elapsed_sec": 64175.44910979271, "step_time_sec": 8.230666001996724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7738, "loss": 4.054274082183838, "lr": 0.0001958145469135738, "elapsed_sec": 64183.681070804596, "step_time_sec": 8.231744594988413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7739, "loss": 4.055152893066406, "lr": 0.00019577941654968337, "elapsed_sec": 64191.91227078438, "step_time_sec": 8.231016018020455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7740, "loss": 4.175965785980225, "lr": 0.00019574414271795182, "elapsed_sec": 64200.142889499664, "step_time_sec": 8.230468791996827, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7741, "loss": 4.078413486480713, "lr": 0.00019570872547412598, "elapsed_sec": 64208.37375807762, "step_time_sec": 8.230703870009165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7742, "loss": 4.288000583648682, "lr": 0.00019567316487417925, "elapsed_sec": 64216.604447841644, "step_time_sec": 8.230551803979324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7743, "loss": 4.162743091583252, "lr": 0.00019563746097431164, "elapsed_sec": 64224.83505678177, "step_time_sec": 8.230397156003164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7744, "loss": 4.172593593597412, "lr": 0.00019560161383094958, "elapsed_sec": 64233.06519293785, "step_time_sec": 8.229912424983922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7745, "loss": 4.187150001525879, "lr": 0.00019556562350074596, "elapsed_sec": 64241.295499801636, "step_time_sec": 8.230181877006544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7746, "loss": 4.137173652648926, "lr": 0.0001955294900405799, "elapsed_sec": 64249.52624583244, "step_time_sec": 8.230559149000328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7747, "loss": 4.123310089111328, "lr": 0.00019549321350755672, "elapsed_sec": 64257.75704097748, "step_time_sec": 8.230629756988492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7748, "loss": 4.25430154800415, "lr": 0.0001954567939590079, "elapsed_sec": 64265.98803257942, "step_time_sec": 8.230857062007999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7749, "loss": 4.230832576751709, "lr": 0.00019542023145249093, "elapsed_sec": 64274.21715450287, "step_time_sec": 8.228899305977393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7750, "loss": 4.110300540924072, "lr": 0.00019538352604578916, "elapsed_sec": 64282.44717216492, "step_time_sec": 8.22986945300363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7751, "loss": 4.135383605957031, "lr": 0.00019534667779691188, "elapsed_sec": 64290.677349328995, "step_time_sec": 8.230053419014439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7752, "loss": 4.223275661468506, "lr": 0.00019530968676409412, "elapsed_sec": 64298.90880703926, "step_time_sec": 8.231243741989601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7753, "loss": 3.9780569076538086, "lr": 0.00019527255300579643, "elapsed_sec": 64307.14025473595, "step_time_sec": 8.231310207018396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7754, "loss": 4.152531147003174, "lr": 0.0001952352765807051, "elapsed_sec": 64315.37168240547, "step_time_sec": 8.23122103899368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7755, "loss": 4.1117963790893555, "lr": 0.00019519785754773178, "elapsed_sec": 64323.602679014206, "step_time_sec": 8.230915205000201, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7756, "loss": 4.181560039520264, "lr": 0.00019516029596601352, "elapsed_sec": 64331.834458589554, "step_time_sec": 8.231571440002881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7757, "loss": 4.142459392547607, "lr": 0.00019512259189491271, "elapsed_sec": 64340.06448721886, "step_time_sec": 8.229903086001286, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7758, "loss": 4.028079509735107, "lr": 0.00019508474539401685, "elapsed_sec": 64348.29375910759, "step_time_sec": 8.229048623994458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7759, "loss": 4.123368263244629, "lr": 0.0001950467565231386, "elapsed_sec": 64356.52372455597, "step_time_sec": 8.229838058003224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7760, "loss": 4.175625801086426, "lr": 0.00019500862534231554, "elapsed_sec": 64364.75478553772, "step_time_sec": 8.230863076023525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7761, "loss": 4.072428226470947, "lr": 0.00019497035191181028, "elapsed_sec": 64372.98627996445, "step_time_sec": 8.231381841993425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7762, "loss": 4.247531890869141, "lr": 0.0001949319362921101, "elapsed_sec": 64381.21674132347, "step_time_sec": 8.230233629001305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7763, "loss": 4.17681884765625, "lr": 0.0001948933785439271, "elapsed_sec": 64389.447546720505, "step_time_sec": 8.230630812991876, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7764, "loss": 4.287696361541748, "lr": 0.000194854678728198, "elapsed_sec": 64397.67837929726, "step_time_sec": 8.230721564003034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7765, "loss": 4.020357131958008, "lr": 0.00019481583690608394, "elapsed_sec": 64405.90871286392, "step_time_sec": 8.230185123014962, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7766, "loss": 4.266476154327393, "lr": 0.00019477685313897065, "elapsed_sec": 64414.14000439644, "step_time_sec": 8.231134934991132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7767, "loss": 4.09067440032959, "lr": 0.00019473772748846801, "elapsed_sec": 64422.37095952034, "step_time_sec": 8.230720380001003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7768, "loss": 4.170546054840088, "lr": 0.00019469846001641026, "elapsed_sec": 64430.60262441635, "step_time_sec": 8.231495228013955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7769, "loss": 4.072796821594238, "lr": 0.00019465905078485573, "elapsed_sec": 64438.83358120918, "step_time_sec": 8.230823974008672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7770, "loss": 4.051712989807129, "lr": 0.00019461949985608677, "elapsed_sec": 64447.06478714943, "step_time_sec": 8.231033199001104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7771, "loss": 4.32444429397583, "lr": 0.00019457980729260972, "elapsed_sec": 64455.29606103897, "step_time_sec": 8.231179414025974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7772, "loss": 4.417281150817871, "lr": 0.00019453997315715474, "elapsed_sec": 64463.52726125717, "step_time_sec": 8.230963557987707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7773, "loss": 4.2351298332214355, "lr": 0.00019449999751267567, "elapsed_sec": 64471.75799894333, "step_time_sec": 8.230575830006273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7774, "loss": 4.181519508361816, "lr": 0.0001944598804223501, "elapsed_sec": 64479.98862147331, "step_time_sec": 8.230489123001462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7775, "loss": 4.189551830291748, "lr": 0.00019441962194957906, "elapsed_sec": 64488.2176527977, "step_time_sec": 8.228881282993825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7776, "loss": 4.0508198738098145, "lr": 0.00019437922215798708, "elapsed_sec": 64496.44611907005, "step_time_sec": 8.228279956994811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7777, "loss": 4.129181385040283, "lr": 0.00019433868111142202, "elapsed_sec": 64504.67249631882, "step_time_sec": 8.226150956004858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7778, "loss": 4.188752174377441, "lr": 0.00019429799887395494, "elapsed_sec": 64512.90391731262, "step_time_sec": 8.231267586001195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7779, "loss": 4.083587646484375, "lr": 0.00019425717550988013, "elapsed_sec": 64521.135133743286, "step_time_sec": 8.231123750010738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7780, "loss": 4.14951753616333, "lr": 0.00019421621108371478, "elapsed_sec": 64529.366824150085, "step_time_sec": 8.231460892013274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7781, "loss": 4.090731620788574, "lr": 0.0001941751056601992, "elapsed_sec": 64537.59770107269, "step_time_sec": 8.23071178398095, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7782, "loss": 4.2078447341918945, "lr": 0.0001941338593042963, "elapsed_sec": 64545.82938909531, "step_time_sec": 8.231543681002222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7783, "loss": 4.104416847229004, "lr": 0.0001940924720811919, "elapsed_sec": 64554.061178684235, "step_time_sec": 8.23166774600395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7784, "loss": 4.153920650482178, "lr": 0.0001940509440562944, "elapsed_sec": 64562.29153132439, "step_time_sec": 8.230158146994654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7785, "loss": 4.067061424255371, "lr": 0.00019400927529523475, "elapsed_sec": 64570.52008271217, "step_time_sec": 8.228367962001357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7786, "loss": 4.211162090301514, "lr": 0.0001939674658638662, "elapsed_sec": 64578.75075364113, "step_time_sec": 8.23051805398427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7787, "loss": 4.2002644538879395, "lr": 0.00019392551582826447, "elapsed_sec": 64586.98213434219, "step_time_sec": 8.231219998007873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7788, "loss": 4.1117472648620605, "lr": 0.00019388342525472735, "elapsed_sec": 64595.21044278145, "step_time_sec": 8.228173517010873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7789, "loss": 4.221299648284912, "lr": 0.00019384119420977485, "elapsed_sec": 64603.43957042694, "step_time_sec": 8.229018810001435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7790, "loss": 3.9382736682891846, "lr": 0.00019379882276014895, "elapsed_sec": 64611.66828799248, "step_time_sec": 8.22852467300254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7791, "loss": 4.200303077697754, "lr": 0.0001937563109728135, "elapsed_sec": 64619.898715019226, "step_time_sec": 8.23024453400285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7792, "loss": 4.28739070892334, "lr": 0.00019371365891495415, "elapsed_sec": 64628.12983226776, "step_time_sec": 8.230950353987282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7793, "loss": 4.2013936042785645, "lr": 0.00019367086665397822, "elapsed_sec": 64636.35943484306, "step_time_sec": 8.229484487004811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7794, "loss": 4.082880020141602, "lr": 0.00019362793425751465, "elapsed_sec": 64644.58881568909, "step_time_sec": 8.229196820990182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7795, "loss": 4.2440080642700195, "lr": 0.0001935848617934138, "elapsed_sec": 64652.81850552559, "step_time_sec": 8.22948732500663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7796, "loss": 4.165805339813232, "lr": 0.00019354164932974745, "elapsed_sec": 64661.04870414734, "step_time_sec": 8.230034681007965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7797, "loss": 4.266365051269531, "lr": 0.00019349829693480857, "elapsed_sec": 64669.27969932556, "step_time_sec": 8.230846009013476, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7798, "loss": 4.218886852264404, "lr": 0.00019345480467711132, "elapsed_sec": 64677.50779867172, "step_time_sec": 8.227958092989866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7799, "loss": 4.131274223327637, "lr": 0.0001934111726253909, "elapsed_sec": 64685.73723912239, "step_time_sec": 8.229274729004828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7800, "loss": 4.132745742797852, "lr": 0.00019336740084860337, "elapsed_sec": 64693.9675154686, "step_time_sec": 8.2301780639973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7801, "loss": 4.230288505554199, "lr": 0.00019332348941592577, "elapsed_sec": 64702.198380708694, "step_time_sec": 8.230646760988748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7802, "loss": 4.194915294647217, "lr": 0.00019327943839675567, "elapsed_sec": 64710.42917895317, "step_time_sec": 8.230634761013789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7803, "loss": 4.112335205078125, "lr": 0.00019323524786071138, "elapsed_sec": 64718.65849041939, "step_time_sec": 8.229229164018761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7804, "loss": 4.223016262054443, "lr": 0.00019319091787763163, "elapsed_sec": 64726.889128685, "step_time_sec": 8.23044477502117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7805, "loss": 4.222371578216553, "lr": 0.0001931464485175755, "elapsed_sec": 64735.12020325661, "step_time_sec": 8.230860269017285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7806, "loss": 4.10492467880249, "lr": 0.00019310183985082248, "elapsed_sec": 64743.350709199905, "step_time_sec": 8.230347617994994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7807, "loss": 4.051187992095947, "lr": 0.00019305709194787206, "elapsed_sec": 64751.58031916618, "step_time_sec": 8.229496493993793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7808, "loss": 3.968032121658325, "lr": 0.0001930122048794439, "elapsed_sec": 64759.80898332596, "step_time_sec": 8.22843452001689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7809, "loss": 3.985220193862915, "lr": 0.00019296717871647754, "elapsed_sec": 64768.03995466232, "step_time_sec": 8.230819304997567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7810, "loss": 3.9496469497680664, "lr": 0.0001929220135301323, "elapsed_sec": 64776.27136397362, "step_time_sec": 8.231297670979984, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7811, "loss": 4.297124862670898, "lr": 0.00019287670939178737, "elapsed_sec": 64784.50259208679, "step_time_sec": 8.231015244004084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7812, "loss": 4.207479953765869, "lr": 0.00019283126637304132, "elapsed_sec": 64792.73419928551, "step_time_sec": 8.231446706980933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7813, "loss": 4.210857391357422, "lr": 0.00019278568454571238, "elapsed_sec": 64800.96505832672, "step_time_sec": 8.230715001991484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7814, "loss": 4.118444442749023, "lr": 0.0001927399639818381, "elapsed_sec": 64809.19642210007, "step_time_sec": 8.23120730399387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7815, "loss": 4.077884197235107, "lr": 0.0001926941047536753, "elapsed_sec": 64817.42736339569, "step_time_sec": 8.230765740008792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7816, "loss": 4.1264214515686035, "lr": 0.0001926481069336999, "elapsed_sec": 64825.65671181679, "step_time_sec": 8.229186943004606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7817, "loss": 4.094372272491455, "lr": 0.00019260197059460686, "elapsed_sec": 64833.887090206146, "step_time_sec": 8.230220664001536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7818, "loss": 4.263648509979248, "lr": 0.00019255569580931011, "elapsed_sec": 64842.117720127106, "step_time_sec": 8.230458776990417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7819, "loss": 4.0579118728637695, "lr": 0.00019250928265094236, "elapsed_sec": 64850.34685087204, "step_time_sec": 8.228996511985315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7820, "loss": 4.095520496368408, "lr": 0.00019246273119285495, "elapsed_sec": 64858.57627868652, "step_time_sec": 8.229299736005487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7821, "loss": 4.11990213394165, "lr": 0.00019241604150861785, "elapsed_sec": 64866.80359864235, "step_time_sec": 8.22713045499404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7822, "loss": 4.31095552444458, "lr": 0.0001923692136720195, "elapsed_sec": 64875.033838272095, "step_time_sec": 8.230112910998287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7823, "loss": 4.116472244262695, "lr": 0.00019232224775706655, "elapsed_sec": 64883.26373577118, "step_time_sec": 8.22966327299946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7824, "loss": 4.208028793334961, "lr": 0.00019227514383798406, "elapsed_sec": 64891.49419426918, "step_time_sec": 8.230353999009822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7825, "loss": 4.136138439178467, "lr": 0.00019222790198921504, "elapsed_sec": 64899.72442674637, "step_time_sec": 8.230010475992458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7826, "loss": 4.110520839691162, "lr": 0.00019218052228542053, "elapsed_sec": 64907.9545917511, "step_time_sec": 8.230004256009124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7827, "loss": 4.094097137451172, "lr": 0.0001921330048014794, "elapsed_sec": 64916.18636393547, "step_time_sec": 8.231635873002233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7828, "loss": 4.259997844696045, "lr": 0.00019208534961248844, "elapsed_sec": 64924.41779756546, "step_time_sec": 8.23130232800031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7829, "loss": 4.088522911071777, "lr": 0.0001920375567937618, "elapsed_sec": 64932.648409605026, "step_time_sec": 8.230424954002956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7830, "loss": 4.280249118804932, "lr": 0.00019198962642083135, "elapsed_sec": 64940.87959909439, "step_time_sec": 8.23100797898951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7831, "loss": 4.2628278732299805, "lr": 0.00019194155856944628, "elapsed_sec": 64949.11035037041, "step_time_sec": 8.230641508998815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7832, "loss": 4.2300591468811035, "lr": 0.00019189335331557298, "elapsed_sec": 64957.340780735016, "step_time_sec": 8.230209993023891, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7833, "loss": 4.203173637390137, "lr": 0.00019184501073539516, "elapsed_sec": 64965.571821689606, "step_time_sec": 8.230921132984804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7834, "loss": 4.243221282958984, "lr": 0.0001917965309053134, "elapsed_sec": 64973.80326652527, "step_time_sec": 8.231307400012156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7835, "loss": 4.129832744598389, "lr": 0.00019174791390194526, "elapsed_sec": 64982.03477048874, "step_time_sec": 8.231317472993396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7836, "loss": 4.041029453277588, "lr": 0.00019169915980212507, "elapsed_sec": 64990.2658905983, "step_time_sec": 8.230954600003315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7837, "loss": 4.124360084533691, "lr": 0.00019165026868290386, "elapsed_sec": 64998.4981033802, "step_time_sec": 8.23204257900943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7838, "loss": 4.139279842376709, "lr": 0.00019160124062154915, "elapsed_sec": 65006.72735977173, "step_time_sec": 8.229143777978607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7839, "loss": 4.062743663787842, "lr": 0.00019155207569554496, "elapsed_sec": 65014.95661473274, "step_time_sec": 8.229028086003382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7840, "loss": 4.1185808181762695, "lr": 0.00019150277398259153, "elapsed_sec": 65023.1870906353, "step_time_sec": 8.230314391985303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7841, "loss": 4.19578218460083, "lr": 0.00019145333556060533, "elapsed_sec": 65031.4174284935, "step_time_sec": 8.23014602100011, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7842, "loss": 4.103794097900391, "lr": 0.00019140376050771888, "elapsed_sec": 65039.64907622337, "step_time_sec": 8.231547502015019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7843, "loss": 4.033780574798584, "lr": 0.0001913540489022806, "elapsed_sec": 65047.879136800766, "step_time_sec": 8.229848752001999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7844, "loss": 4.142269611358643, "lr": 0.00019130420082285476, "elapsed_sec": 65056.11028313637, "step_time_sec": 8.230996503989445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7845, "loss": 4.382254123687744, "lr": 0.00019125421634822126, "elapsed_sec": 65064.34071111679, "step_time_sec": 8.230259314004797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7846, "loss": 4.162545204162598, "lr": 0.00019120409555737565, "elapsed_sec": 65072.57172012329, "step_time_sec": 8.230867334990762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7847, "loss": 4.145630836486816, "lr": 0.00019115383852952878, "elapsed_sec": 65080.80306267738, "step_time_sec": 8.231161643983796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7848, "loss": 4.209008693695068, "lr": 0.00019110344534410695, "elapsed_sec": 65089.034277915955, "step_time_sec": 8.231028676003916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7849, "loss": 4.330309867858887, "lr": 0.00019105291608075155, "elapsed_sec": 65097.26530766487, "step_time_sec": 8.230868546001147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7850, "loss": 4.1137895584106445, "lr": 0.00019100225081931908, "elapsed_sec": 65105.494762182236, "step_time_sec": 8.229277948994422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7851, "loss": 4.148992538452148, "lr": 0.00019095144963988088, "elapsed_sec": 65113.725600004196, "step_time_sec": 8.230770846013911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7852, "loss": 4.100924015045166, "lr": 0.00019090051262272325, "elapsed_sec": 65121.95661520958, "step_time_sec": 8.230841124022845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7853, "loss": 4.0714335441589355, "lr": 0.00019084943984834702, "elapsed_sec": 65130.18690085411, "step_time_sec": 8.230090768018272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7854, "loss": 4.254868507385254, "lr": 0.00019079823139746767, "elapsed_sec": 65138.41740989685, "step_time_sec": 8.230369905009866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7855, "loss": 4.2090935707092285, "lr": 0.00019074688735101503, "elapsed_sec": 65146.6479268074, "step_time_sec": 8.230348943994613, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7856, "loss": 4.123810768127441, "lr": 0.00019069540779013332, "elapsed_sec": 65154.87978196144, "step_time_sec": 8.231666706997203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7857, "loss": 4.133962154388428, "lr": 0.00019064379279618078, "elapsed_sec": 65163.10992479324, "step_time_sec": 8.229923915991094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7858, "loss": 3.8857710361480713, "lr": 0.00019059204245072981, "elapsed_sec": 65171.340512514114, "step_time_sec": 8.230421930988086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7859, "loss": 4.189362525939941, "lr": 0.00019054015683556673, "elapsed_sec": 65179.57116436958, "step_time_sec": 8.230541901983088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7860, "loss": 4.006592273712158, "lr": 0.00019048813603269154, "elapsed_sec": 65187.80166506767, "step_time_sec": 8.23028222998255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7861, "loss": 4.298309803009033, "lr": 0.00019043598012431797, "elapsed_sec": 65196.033108234406, "step_time_sec": 8.231317767989822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7862, "loss": 4.215229034423828, "lr": 0.00019038368919287326, "elapsed_sec": 65204.26349425316, "step_time_sec": 8.230220435012598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7863, "loss": 4.152112007141113, "lr": 0.000190331263320998, "elapsed_sec": 65212.495465278625, "step_time_sec": 8.23181205498986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7864, "loss": 4.094135284423828, "lr": 0.00019027870259154608, "elapsed_sec": 65220.725682497025, "step_time_sec": 8.230071850994136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7865, "loss": 4.225912094116211, "lr": 0.0001902260070875845, "elapsed_sec": 65228.95725727081, "step_time_sec": 8.231408580002608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7866, "loss": 4.18304967880249, "lr": 0.00019017317689239325, "elapsed_sec": 65237.18771672249, "step_time_sec": 8.23036067801877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7867, "loss": 4.189637184143066, "lr": 0.00019012021208946527, "elapsed_sec": 65245.41646027565, "step_time_sec": 8.228499147982802, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7868, "loss": 4.097110748291016, "lr": 0.00019006711276250607, "elapsed_sec": 65253.644713163376, "step_time_sec": 8.228121301013744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7869, "loss": 4.052175045013428, "lr": 0.00019001387899543392, "elapsed_sec": 65261.8754234314, "step_time_sec": 8.230610497004818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7870, "loss": 4.020246982574463, "lr": 0.00018996051087237948, "elapsed_sec": 65270.106996297836, "step_time_sec": 8.231411201995797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7871, "loss": 4.144284248352051, "lr": 0.0001899070084776857, "elapsed_sec": 65278.3370950222, "step_time_sec": 8.229948868014617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7872, "loss": 4.201416015625, "lr": 0.00018985337189590795, "elapsed_sec": 65286.56718206406, "step_time_sec": 8.229848806018708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7873, "loss": 3.979731559753418, "lr": 0.00018979960121181334, "elapsed_sec": 65294.79847621918, "step_time_sec": 8.231163210992236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7874, "loss": 3.9503629207611084, "lr": 0.00018974569651038122, "elapsed_sec": 65303.02803277969, "step_time_sec": 8.229409697989468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7875, "loss": 4.319154739379883, "lr": 0.0001896916578768025, "elapsed_sec": 65311.25696969032, "step_time_sec": 8.228766572021414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7876, "loss": 4.325501918792725, "lr": 0.00018963748539647997, "elapsed_sec": 65319.486172914505, "step_time_sec": 8.229007032990921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7877, "loss": 4.105000019073486, "lr": 0.00018958317915502779, "elapsed_sec": 65327.716504096985, "step_time_sec": 8.230160067003453, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7878, "loss": 4.1155781745910645, "lr": 0.00018952873923827157, "elapsed_sec": 65335.9466073513, "step_time_sec": 8.229944111983059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7879, "loss": 4.006009101867676, "lr": 0.0001894741657322482, "elapsed_sec": 65344.1763958931, "step_time_sec": 8.229633098992053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7880, "loss": 4.14911413192749, "lr": 0.0001894194587232057, "elapsed_sec": 65352.40800237656, "step_time_sec": 8.231452072999673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7881, "loss": 4.124309539794922, "lr": 0.00018936461829760306, "elapsed_sec": 65360.637169361115, "step_time_sec": 8.228973807999864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7882, "loss": 3.855360269546509, "lr": 0.00018930964454211006, "elapsed_sec": 65368.86864757538, "step_time_sec": 8.231321953004226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7883, "loss": 4.211221694946289, "lr": 0.0001892545375436073, "elapsed_sec": 65377.09961295128, "step_time_sec": 8.230864800018026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7884, "loss": 4.071603298187256, "lr": 0.0001891992973891859, "elapsed_sec": 65385.328981637955, "step_time_sec": 8.229176038003061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7885, "loss": 4.15247106552124, "lr": 0.00018914392416614746, "elapsed_sec": 65393.55788731575, "step_time_sec": 8.228759756020736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7886, "loss": 4.134419918060303, "lr": 0.0001890884179620038, "elapsed_sec": 65401.7849149704, "step_time_sec": 8.226825518009719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7887, "loss": 4.342655181884766, "lr": 0.000189032778864477, "elapsed_sec": 65410.01568365097, "step_time_sec": 8.230644630006282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7888, "loss": 4.382270812988281, "lr": 0.0001889770069614991, "elapsed_sec": 65418.24763083458, "step_time_sec": 8.231729160994291, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7889, "loss": 4.253467082977295, "lr": 0.00018892110234121205, "elapsed_sec": 65426.478752851486, "step_time_sec": 8.230955539009301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7890, "loss": 4.670515060424805, "lr": 0.00018886506509196754, "elapsed_sec": 65434.70932149887, "step_time_sec": 8.230427439993946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7891, "loss": 4.3643479347229, "lr": 0.0001888088953023269, "elapsed_sec": 65442.94006681442, "step_time_sec": 8.23056753000128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7892, "loss": 4.331240177154541, "lr": 0.00018875259306106086, "elapsed_sec": 65451.17178630829, "step_time_sec": 8.23157892099698, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7893, "loss": 4.362483978271484, "lr": 0.00018869615845714953, "elapsed_sec": 65459.40291285515, "step_time_sec": 8.230945539020468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7894, "loss": 4.167449474334717, "lr": 0.0001886395915797822, "elapsed_sec": 65467.63412308693, "step_time_sec": 8.231032965006307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7895, "loss": 4.24563455581665, "lr": 0.00018858289251835717, "elapsed_sec": 65475.864184856415, "step_time_sec": 8.22996783000417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7896, "loss": 4.1089186668396, "lr": 0.00018852606136248167, "elapsed_sec": 65484.096158504486, "step_time_sec": 8.231760768016102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7897, "loss": 4.2040205001831055, "lr": 0.0001884690982019717, "elapsed_sec": 65492.3258368969, "step_time_sec": 8.229567247006344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7898, "loss": 4.124983787536621, "lr": 0.0001884120031268519, "elapsed_sec": 65500.5565571785, "step_time_sec": 8.230526293016737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7899, "loss": 4.034176349639893, "lr": 0.00018835477622735532, "elapsed_sec": 65508.78728699684, "step_time_sec": 8.230514194001444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7900, "loss": 4.1201043128967285, "lr": 0.0001882974175939234, "elapsed_sec": 65517.01781797409, "step_time_sec": 8.230385106027825, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7901, "loss": 4.316797256469727, "lr": 0.00018823992731720573, "elapsed_sec": 65525.249040842056, "step_time_sec": 8.23107115298626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7902, "loss": 4.149039268493652, "lr": 0.00018818230548806001, "elapsed_sec": 65533.48024535179, "step_time_sec": 8.231051737995585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7903, "loss": 4.323768615722656, "lr": 0.0001881245521975518, "elapsed_sec": 65541.71156406403, "step_time_sec": 8.231143736978993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7904, "loss": 4.278055191040039, "lr": 0.00018806666753695445, "elapsed_sec": 65549.9424533844, "step_time_sec": 8.230744422995485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7905, "loss": 4.05115270614624, "lr": 0.00018800865159774885, "elapsed_sec": 65558.1724331379, "step_time_sec": 8.229830071009928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7906, "loss": 4.0900702476501465, "lr": 0.0001879505044716235, "elapsed_sec": 65566.40298724174, "step_time_sec": 8.230447687004926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7907, "loss": 4.086911201477051, "lr": 0.0001878922262504741, "elapsed_sec": 65574.6340110302, "step_time_sec": 8.23078632398392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7908, "loss": 4.1042890548706055, "lr": 0.0001878338170264036, "elapsed_sec": 65582.86508846283, "step_time_sec": 8.23090473699267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7909, "loss": 3.935116767883301, "lr": 0.000187775276891722, "elapsed_sec": 65591.09586215019, "step_time_sec": 8.230657628999325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7910, "loss": 4.069327354431152, "lr": 0.0001877166059389461, "elapsed_sec": 65599.32761383057, "step_time_sec": 8.231558766012313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7911, "loss": 4.0788350105285645, "lr": 0.00018765780426079957, "elapsed_sec": 65607.55818200111, "step_time_sec": 8.230457864003256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7912, "loss": 4.1271891593933105, "lr": 0.00018759887195021256, "elapsed_sec": 65615.78877544403, "step_time_sec": 8.230351234000409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7913, "loss": 3.900033473968506, "lr": 0.00018753980910032176, "elapsed_sec": 65624.01813030243, "step_time_sec": 8.229220192006323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7914, "loss": 4.10027551651001, "lr": 0.0001874806158044701, "elapsed_sec": 65632.24736571312, "step_time_sec": 8.229134192981292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7915, "loss": 4.239595890045166, "lr": 0.00018742129215620676, "elapsed_sec": 65640.47835206985, "step_time_sec": 8.230775506992359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7916, "loss": 4.025651931762695, "lr": 0.0001873618382492868, "elapsed_sec": 65648.70930933952, "step_time_sec": 8.230815504008206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7917, "loss": 4.158381462097168, "lr": 0.00018730225417767126, "elapsed_sec": 65656.94090747833, "step_time_sec": 8.231454719003523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7918, "loss": 4.11113977432251, "lr": 0.0001872425400355268, "elapsed_sec": 65665.17164206505, "step_time_sec": 8.230533964000642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7919, "loss": 4.099027156829834, "lr": 0.00018718269591722569, "elapsed_sec": 65673.40213155746, "step_time_sec": 8.23033875101828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7920, "loss": 4.0863213539123535, "lr": 0.00018712272191734563, "elapsed_sec": 65681.63409638405, "step_time_sec": 8.231793674989603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7921, "loss": 4.118597507476807, "lr": 0.00018706261813066955, "elapsed_sec": 65689.863956213, "step_time_sec": 8.229716138972435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7922, "loss": 4.043849468231201, "lr": 0.0001870023846521855, "elapsed_sec": 65698.09481430054, "step_time_sec": 8.23066626500804, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7923, "loss": 4.100731372833252, "lr": 0.00018694202157708655, "elapsed_sec": 65706.32616639137, "step_time_sec": 8.231213833991205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7924, "loss": 4.054704666137695, "lr": 0.00018688152900077045, "elapsed_sec": 65714.55681419373, "step_time_sec": 8.230485175008653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7925, "loss": 4.082074165344238, "lr": 0.0001868209070188398, "elapsed_sec": 65722.78771471977, "step_time_sec": 8.23071084098774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7926, "loss": 4.173854827880859, "lr": 0.00018676015572710155, "elapsed_sec": 65731.01548409462, "step_time_sec": 8.227620638994267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7927, "loss": 4.040483474731445, "lr": 0.0001866992752215671, "elapsed_sec": 65739.24375844002, "step_time_sec": 8.228119302977575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7928, "loss": 4.0341057777404785, "lr": 0.00018663826559845202, "elapsed_sec": 65747.47282719612, "step_time_sec": 8.228938207001192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7929, "loss": 4.188173294067383, "lr": 0.00018657712695417596, "elapsed_sec": 65755.7045636177, "step_time_sec": 8.231524817005266, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7930, "loss": 4.141822814941406, "lr": 0.0001865158593853625, "elapsed_sec": 65763.93516993523, "step_time_sec": 8.230465303990059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7931, "loss": 4.098898887634277, "lr": 0.00018645446298883885, "elapsed_sec": 65772.16670703888, "step_time_sec": 8.2313632130099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7932, "loss": 4.074098587036133, "lr": 0.000186392937861636, "elapsed_sec": 65780.39747452736, "step_time_sec": 8.230634879990248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7933, "loss": 4.21925687789917, "lr": 0.0001863312841009883, "elapsed_sec": 65788.62869644165, "step_time_sec": 8.231045244989218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7934, "loss": 4.017517566680908, "lr": 0.00018626950180433328, "elapsed_sec": 65796.85905385017, "step_time_sec": 8.230174219992477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7935, "loss": 4.068975925445557, "lr": 0.00018620759106931185, "elapsed_sec": 65805.09013819695, "step_time_sec": 8.230935102998046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7936, "loss": 4.3445725440979, "lr": 0.00018614555199376767, "elapsed_sec": 65813.32128047943, "step_time_sec": 8.230972010991536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7937, "loss": 4.089942932128906, "lr": 0.0001860833846757474, "elapsed_sec": 65821.55159044266, "step_time_sec": 8.230208566994406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7938, "loss": 3.982985734939575, "lr": 0.00018602108921350025, "elapsed_sec": 65829.78271055222, "step_time_sec": 8.230917153996415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7939, "loss": 3.9627277851104736, "lr": 0.00018595866570547806, "elapsed_sec": 65838.01325011253, "step_time_sec": 8.230441271996824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7940, "loss": 4.236004829406738, "lr": 0.00018589611425033492, "elapsed_sec": 65846.24202251434, "step_time_sec": 8.228532137000002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7941, "loss": 4.063332557678223, "lr": 0.0001858334349469272, "elapsed_sec": 65854.47019720078, "step_time_sec": 8.228015041997423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7942, "loss": 4.218339443206787, "lr": 0.00018577062789431337, "elapsed_sec": 65862.70086574554, "step_time_sec": 8.230565160018159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7943, "loss": 4.143144607543945, "lr": 0.00018570769319175363, "elapsed_sec": 65870.93375492096, "step_time_sec": 8.232662499009166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7944, "loss": 4.242417812347412, "lr": 0.00018564463093871008, "elapsed_sec": 65879.16443538666, "step_time_sec": 8.230514718015911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7945, "loss": 4.184168338775635, "lr": 0.00018558144123484633, "elapsed_sec": 65887.3956644535, "step_time_sec": 8.231116875016596, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7946, "loss": 4.049352169036865, "lr": 0.00018551812418002742, "elapsed_sec": 65895.62724399567, "step_time_sec": 8.231435652007349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7947, "loss": 4.029140949249268, "lr": 0.00018545467987431965, "elapsed_sec": 65903.85784554482, "step_time_sec": 8.230378012987785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7948, "loss": 4.132711887359619, "lr": 0.00018539110841799046, "elapsed_sec": 65912.08855509758, "step_time_sec": 8.230530102009652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7949, "loss": 4.1784515380859375, "lr": 0.00018532740991150822, "elapsed_sec": 65920.3200955391, "step_time_sec": 8.231373835005797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7950, "loss": 4.116335868835449, "lr": 0.00018526358445554203, "elapsed_sec": 65928.55172348022, "step_time_sec": 8.23151653201785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7951, "loss": 4.0649542808532715, "lr": 0.00018519963215096174, "elapsed_sec": 65936.78236293793, "step_time_sec": 8.230415582016576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7952, "loss": 4.020324230194092, "lr": 0.00018513555309883757, "elapsed_sec": 65945.01282024384, "step_time_sec": 8.230349871999351, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7953, "loss": 4.0935492515563965, "lr": 0.00018507134740044012, "elapsed_sec": 65953.24165701866, "step_time_sec": 8.22858603601344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7954, "loss": 4.070353031158447, "lr": 0.00018500701515724007, "elapsed_sec": 65961.47170162201, "step_time_sec": 8.22987850100617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7955, "loss": 4.002612590789795, "lr": 0.00018494255647090818, "elapsed_sec": 65969.70183110237, "step_time_sec": 8.229987467988394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7956, "loss": 3.9502782821655273, "lr": 0.00018487797144331496, "elapsed_sec": 65977.93087720871, "step_time_sec": 8.228923360991757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7957, "loss": 3.9899439811706543, "lr": 0.00018481326017653064, "elapsed_sec": 65986.15916633606, "step_time_sec": 8.228067399992142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7958, "loss": 3.880786180496216, "lr": 0.00018474842277282495, "elapsed_sec": 65994.38831877708, "step_time_sec": 8.229000304010697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7959, "loss": 4.114177227020264, "lr": 0.00018468345933466695, "elapsed_sec": 66002.61662077904, "step_time_sec": 8.228215531009482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7960, "loss": 4.116592884063721, "lr": 0.00018461836996472484, "elapsed_sec": 66010.84721541405, "step_time_sec": 8.230389100994216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7961, "loss": 4.236263751983643, "lr": 0.00018455315476586604, "elapsed_sec": 66019.07785868645, "step_time_sec": 8.23045970700332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7962, "loss": 4.2731614112854, "lr": 0.00018448781384115655, "elapsed_sec": 66027.3093726635, "step_time_sec": 8.231344405008713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7963, "loss": 4.253289222717285, "lr": 0.00018442234729386123, "elapsed_sec": 66035.54033136368, "step_time_sec": 8.230787936015986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7964, "loss": 4.169473648071289, "lr": 0.0001843567552274435, "elapsed_sec": 66043.77113962173, "step_time_sec": 8.230683392001083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7965, "loss": 4.245439529418945, "lr": 0.00018429103774556506, "elapsed_sec": 66052.00253415108, "step_time_sec": 8.23124142299639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7966, "loss": 4.008779525756836, "lr": 0.00018422519495208585, "elapsed_sec": 66060.23260736465, "step_time_sec": 8.229870490002213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7967, "loss": 4.102035045623779, "lr": 0.0001841592269510639, "elapsed_sec": 66068.46371746063, "step_time_sec": 8.231000586994924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7968, "loss": 4.1217570304870605, "lr": 0.00018409313384675502, "elapsed_sec": 66076.6942498684, "step_time_sec": 8.23034842198831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7969, "loss": 4.176358222961426, "lr": 0.0001840269157436128, "elapsed_sec": 66084.92546319962, "step_time_sec": 8.231020359002287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7970, "loss": 4.1939377784729, "lr": 0.00018396057274628843, "elapsed_sec": 66093.15382814407, "step_time_sec": 8.228220645018155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7971, "loss": 4.195115089416504, "lr": 0.00018389410495963035, "elapsed_sec": 66101.38389277458, "step_time_sec": 8.229847827984486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7972, "loss": 4.132490634918213, "lr": 0.00018382751248868432, "elapsed_sec": 66109.61282253265, "step_time_sec": 8.228777223004727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7973, "loss": 4.191760063171387, "lr": 0.00018376079543869315, "elapsed_sec": 66117.8433265686, "step_time_sec": 8.230415154015645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7974, "loss": 4.134914398193359, "lr": 0.0001836939539150964, "elapsed_sec": 66126.07458662987, "step_time_sec": 8.231039645994315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7975, "loss": 3.9226551055908203, "lr": 0.00018362698802353052, "elapsed_sec": 66134.30269217491, "step_time_sec": 8.227939956996124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7976, "loss": 4.099826335906982, "lr": 0.00018355989786982848, "elapsed_sec": 66142.53130126, "step_time_sec": 8.228514993010322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7977, "loss": 4.075515270233154, "lr": 0.0001834926835600195, "elapsed_sec": 66150.76002192497, "step_time_sec": 8.22849829099141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7978, "loss": 4.044533729553223, "lr": 0.00018342534520032923, "elapsed_sec": 66158.99084424973, "step_time_sec": 8.230653834994882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7979, "loss": 4.159223556518555, "lr": 0.00018335788289717918, "elapsed_sec": 66167.22180175781, "step_time_sec": 8.230813643982401, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7980, "loss": 4.22202205657959, "lr": 0.00018329029675718676, "elapsed_sec": 66175.45311331749, "step_time_sec": 8.2311206330196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7981, "loss": 4.173385143280029, "lr": 0.00018322258688716525, "elapsed_sec": 66183.6841878891, "step_time_sec": 8.230915933992947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7982, "loss": 4.229280948638916, "lr": 0.00018315475339412333, "elapsed_sec": 66191.91525554657, "step_time_sec": 8.230897547007771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7983, "loss": 3.9983015060424805, "lr": 0.00018308679638526508, "elapsed_sec": 66200.14591646194, "step_time_sec": 8.23052482600906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7984, "loss": 4.114078521728516, "lr": 0.0001830187159679898, "elapsed_sec": 66208.37656259537, "step_time_sec": 8.230442925007083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7985, "loss": 4.149316787719727, "lr": 0.0001829505122498918, "elapsed_sec": 66217.6539285183, "step_time_sec": 9.277239368006121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7986, "loss": 4.0551981925964355, "lr": 0.00018288218533876033, "elapsed_sec": 66225.88457202911, "step_time_sec": 8.230465517000994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7987, "loss": 4.261816024780273, "lr": 0.0001828137353425792, "elapsed_sec": 66234.11732339859, "step_time_sec": 8.232591177977156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7988, "loss": 3.978518486022949, "lr": 0.00018274516236952687, "elapsed_sec": 66242.34885501862, "step_time_sec": 8.231358043994987, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7989, "loss": 4.163060665130615, "lr": 0.00018267646652797613, "elapsed_sec": 66250.57950806618, "step_time_sec": 8.230502245976822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7990, "loss": 4.107173442840576, "lr": 0.00018260764792649387, "elapsed_sec": 66258.81173586845, "step_time_sec": 8.23209514998598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7991, "loss": 3.9054977893829346, "lr": 0.00018253870667384107, "elapsed_sec": 66267.04074239731, "step_time_sec": 8.228821069991682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7992, "loss": 4.170050144195557, "lr": 0.00018246964287897254, "elapsed_sec": 66275.26942324638, "step_time_sec": 8.228521044016816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7993, "loss": 4.1825175285339355, "lr": 0.00018240045665103672, "elapsed_sec": 66283.49801325798, "step_time_sec": 8.228433766023954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7994, "loss": 4.058358192443848, "lr": 0.00018233114809937558, "elapsed_sec": 66291.72931361198, "step_time_sec": 8.231144600984408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7995, "loss": 4.1496806144714355, "lr": 0.0001822617173335244, "elapsed_sec": 66299.9606282711, "step_time_sec": 8.231170198996551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7996, "loss": 4.129876613616943, "lr": 0.00018219216446321154, "elapsed_sec": 66308.1911149025, "step_time_sec": 8.230353952996666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7997, "loss": 4.0687456130981445, "lr": 0.00018212248959835847, "elapsed_sec": 66316.42262768745, "step_time_sec": 8.231289683986688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7998, "loss": 4.177155017852783, "lr": 0.00018205269284907936, "elapsed_sec": 66324.65320920944, "step_time_sec": 8.230441449006321, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 7999, "loss": 4.1697564125061035, "lr": 0.00018198277432568101, "elapsed_sec": 66332.88380861282, "step_time_sec": 8.230479866004316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8000, "loss": 3.994699001312256, "lr": 0.00018191273413866273, "elapsed_sec": 66341.11414861679, "step_time_sec": 52.691264970984776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9977132230123971, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8001, "loss": 4.018739700317383, "lr": 0.00018184257239871606, "elapsed_sec": 66393.80702233315, "step_time_sec": 8.231630153983133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8002, "loss": 4.118190288543701, "lr": 0.00018177228921672462, "elapsed_sec": 66402.02420759201, "step_time_sec": 8.216995316994144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8003, "loss": 3.9620327949523926, "lr": 0.00018170188470376407, "elapsed_sec": 66410.24159097672, "step_time_sec": 8.217275083996356, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8004, "loss": 4.0263848304748535, "lr": 0.00018163135897110165, "elapsed_sec": 66418.46801233292, "step_time_sec": 8.226309539983049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8005, "loss": 4.043581485748291, "lr": 0.00018156071213019633, "elapsed_sec": 66426.69921469688, "step_time_sec": 8.230962563015055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8006, "loss": 4.071379661560059, "lr": 0.00018148994429269838, "elapsed_sec": 66434.92947292328, "step_time_sec": 8.23012390101212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8007, "loss": 4.128544330596924, "lr": 0.00018141905557044936, "elapsed_sec": 66443.16103124619, "step_time_sec": 8.231461224000668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8008, "loss": 3.983898401260376, "lr": 0.0001813480460754818, "elapsed_sec": 66451.39264702797, "step_time_sec": 8.231370002991753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8009, "loss": 4.019167423248291, "lr": 0.0001812769159200192, "elapsed_sec": 66459.62352633476, "step_time_sec": 8.230716310004937, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8010, "loss": 4.089117527008057, "lr": 0.0001812056652164757, "elapsed_sec": 66467.85526704788, "step_time_sec": 8.231634141993709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8011, "loss": 4.0709991455078125, "lr": 0.00018113429407745589, "elapsed_sec": 66476.08666038513, "step_time_sec": 8.231196776003344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8012, "loss": 3.947174072265625, "lr": 0.00018106280261575482, "elapsed_sec": 66484.31782269478, "step_time_sec": 8.230994880985236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8013, "loss": 3.951263427734375, "lr": 0.00018099119094435758, "elapsed_sec": 66492.54809856415, "step_time_sec": 8.230117545987014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8014, "loss": 4.019392490386963, "lr": 0.0001809194591764394, "elapsed_sec": 66500.7778635025, "step_time_sec": 8.22961435301113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8015, "loss": 4.08423376083374, "lr": 0.00018084760742536504, "elapsed_sec": 66509.00812149048, "step_time_sec": 8.230101455003023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8016, "loss": 4.02824068069458, "lr": 0.00018077563580468921, "elapsed_sec": 66517.23880720139, "step_time_sec": 8.23053805399104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8017, "loss": 4.211193084716797, "lr": 0.00018070354442815582, "elapsed_sec": 66525.46718144417, "step_time_sec": 8.228246719983872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8018, "loss": 3.948974847793579, "lr": 0.0001806313334096981, "elapsed_sec": 66533.69661951065, "step_time_sec": 8.229295266006375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8019, "loss": 4.330013751983643, "lr": 0.00018055900286343844, "elapsed_sec": 66541.92425656319, "step_time_sec": 8.227460706984857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8020, "loss": 4.210188865661621, "lr": 0.00018048655290368808, "elapsed_sec": 66550.15512251854, "step_time_sec": 8.230710810021264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8021, "loss": 4.197790622711182, "lr": 0.00018041398364494694, "elapsed_sec": 66558.38264632225, "step_time_sec": 8.227400792005938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8022, "loss": 4.2085676193237305, "lr": 0.00018034129520190354, "elapsed_sec": 66566.61223220825, "step_time_sec": 8.229371813999023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8023, "loss": 4.08642053604126, "lr": 0.0001802684876894347, "elapsed_sec": 66574.84122514725, "step_time_sec": 8.228867470024852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8024, "loss": 4.100363254547119, "lr": 0.0001801955612226055, "elapsed_sec": 66583.06982040405, "step_time_sec": 8.228470333997393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8025, "loss": 4.083750247955322, "lr": 0.00018012251591666893, "elapsed_sec": 66591.29901576042, "step_time_sec": 8.228995916986605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8026, "loss": 4.322475433349609, "lr": 0.00018004935188706587, "elapsed_sec": 66599.5288105011, "step_time_sec": 8.229633488022955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8027, "loss": 4.180636882781982, "lr": 0.00017997606924942476, "elapsed_sec": 66607.75833916664, "step_time_sec": 8.229357298987452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8028, "loss": 4.106827259063721, "lr": 0.00017990266811956153, "elapsed_sec": 66615.98674106598, "step_time_sec": 8.228261052980088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8029, "loss": 4.3787522315979, "lr": 0.0001798291486134794, "elapsed_sec": 66624.21835446358, "step_time_sec": 8.23145354699227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8030, "loss": 4.408085346221924, "lr": 0.00017975551084736861, "elapsed_sec": 66632.44916176796, "step_time_sec": 8.230660164001165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8031, "loss": 4.004117965698242, "lr": 0.00017968175493760633, "elapsed_sec": 66640.68030714989, "step_time_sec": 8.2310058150033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8032, "loss": 4.12358283996582, "lr": 0.00017960788100075645, "elapsed_sec": 66648.91227674484, "step_time_sec": 8.231805996008916, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8033, "loss": 4.088930130004883, "lr": 0.00017953388915356933, "elapsed_sec": 66657.14312124252, "step_time_sec": 8.23068362698541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8034, "loss": 4.031378269195557, "lr": 0.0001794597795129818, "elapsed_sec": 66665.37401366234, "step_time_sec": 8.230741092993412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8035, "loss": 3.9215145111083984, "lr": 0.00017938555219611677, "elapsed_sec": 66673.60468435287, "step_time_sec": 8.230571083986433, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8036, "loss": 4.038020133972168, "lr": 0.0001793112073202831, "elapsed_sec": 66681.83660531044, "step_time_sec": 8.23174611700233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8037, "loss": 3.983860969543457, "lr": 0.00017923674500297545, "elapsed_sec": 66690.06546854973, "step_time_sec": 8.228638799017062, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8038, "loss": 4.027338981628418, "lr": 0.00017916216536187417, "elapsed_sec": 66698.29508256912, "step_time_sec": 8.229552303993842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8039, "loss": 3.960996389389038, "lr": 0.00017908746851484492, "elapsed_sec": 66706.52611660957, "step_time_sec": 8.230783445993438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8040, "loss": 4.071220874786377, "lr": 0.0001790126545799387, "elapsed_sec": 66714.75607013702, "step_time_sec": 8.229790179000702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8041, "loss": 4.100517749786377, "lr": 0.00017893772367539142, "elapsed_sec": 66722.98595404625, "step_time_sec": 8.229784324998036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8042, "loss": 4.017546653747559, "lr": 0.00017886267591962396, "elapsed_sec": 66731.2159333229, "step_time_sec": 8.22982199199032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8043, "loss": 3.9997358322143555, "lr": 0.00017878751143124186, "elapsed_sec": 66739.44612288475, "step_time_sec": 8.230023271986283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8044, "loss": 4.116062164306641, "lr": 0.00017871223032903507, "elapsed_sec": 66747.6756772995, "step_time_sec": 8.229393309011357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8045, "loss": 4.093998908996582, "lr": 0.00017863683273197794, "elapsed_sec": 66755.90800714493, "step_time_sec": 8.232178285019472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8046, "loss": 4.263824462890625, "lr": 0.00017856131875922882, "elapsed_sec": 66764.14064526558, "step_time_sec": 8.232458421989577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8047, "loss": 3.9643547534942627, "lr": 0.00017848568853013006, "elapsed_sec": 66772.37338709831, "step_time_sec": 8.23249701198074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8048, "loss": 4.028225421905518, "lr": 0.00017840994216420771, "elapsed_sec": 66780.60430908203, "step_time_sec": 8.230795918992953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8049, "loss": 3.967130661010742, "lr": 0.00017833407978117138, "elapsed_sec": 66788.83489632607, "step_time_sec": 8.230454039992765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8050, "loss": 4.156105041503906, "lr": 0.00017825810150091402, "elapsed_sec": 66797.06648755074, "step_time_sec": 8.23138600299717, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8051, "loss": 4.065688133239746, "lr": 0.00017818200744351177, "elapsed_sec": 66805.29738163948, "step_time_sec": 8.230706314003328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8052, "loss": 4.062596797943115, "lr": 0.00017810579772922365, "elapsed_sec": 66813.52810406685, "step_time_sec": 8.23057806599536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8053, "loss": 4.21670389175415, "lr": 0.00017802947247849162, "elapsed_sec": 66821.75915360451, "step_time_sec": 8.23088637200999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8054, "loss": 4.2858757972717285, "lr": 0.0001779530318119401, "elapsed_sec": 66829.99024152756, "step_time_sec": 8.230937745014671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8055, "loss": 4.153250694274902, "lr": 0.000177876475850376, "elapsed_sec": 66838.22106194496, "step_time_sec": 8.230724230001215, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8056, "loss": 4.090001583099365, "lr": 0.00017779980471478835, "elapsed_sec": 66846.45220351219, "step_time_sec": 8.23091456698603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8057, "loss": 4.087268829345703, "lr": 0.00017772301852634828, "elapsed_sec": 66854.68333053589, "step_time_sec": 8.231022640014999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8058, "loss": 4.077794075012207, "lr": 0.00017764611740640873, "elapsed_sec": 66862.91059613228, "step_time_sec": 8.227055494993692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8059, "loss": 4.0402679443359375, "lr": 0.0001775691014765043, "elapsed_sec": 66871.13925933838, "step_time_sec": 8.228525845013792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8060, "loss": 4.23488712310791, "lr": 0.00017749197085835096, "elapsed_sec": 66879.3689866066, "step_time_sec": 8.22957304000738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8061, "loss": 4.066333770751953, "lr": 0.000177414725673846, "elapsed_sec": 66887.59983277321, "step_time_sec": 8.230668648000574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8062, "loss": 4.075894832611084, "lr": 0.00017733736604506778, "elapsed_sec": 66895.83285570145, "step_time_sec": 8.232918458001222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8063, "loss": 4.26380729675293, "lr": 0.00017725989209427546, "elapsed_sec": 66904.06270265579, "step_time_sec": 8.229672176996246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8064, "loss": 4.11187219619751, "lr": 0.00017718230394390895, "elapsed_sec": 66912.29447841644, "step_time_sec": 8.231578742997954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8065, "loss": 4.063866138458252, "lr": 0.00017710460171658861, "elapsed_sec": 66920.52593660355, "step_time_sec": 8.231325214990648, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8066, "loss": 4.121595859527588, "lr": 0.00017702678553511503, "elapsed_sec": 66928.75730538368, "step_time_sec": 8.231198458001018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8067, "loss": 4.004380702972412, "lr": 0.000176948855522469, "elapsed_sec": 66936.98818469048, "step_time_sec": 8.230721428990364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8068, "loss": 4.255770683288574, "lr": 0.00017687081180181114, "elapsed_sec": 66945.2198047638, "step_time_sec": 8.231441365991486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8069, "loss": 4.1032915115356445, "lr": 0.00017679265449648178, "elapsed_sec": 66953.45082473755, "step_time_sec": 8.2309131109796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8070, "loss": 4.152338981628418, "lr": 0.00017671438373000075, "elapsed_sec": 66961.6814057827, "step_time_sec": 8.230392327008303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8071, "loss": 4.1574296951293945, "lr": 0.0001766359996260673, "elapsed_sec": 66969.91255068779, "step_time_sec": 8.23100205999799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8072, "loss": 4.0374064445495605, "lr": 0.00017655750230855958, "elapsed_sec": 66978.14332008362, "step_time_sec": 8.230605122982524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8073, "loss": 4.112817764282227, "lr": 0.00017647889190153493, "elapsed_sec": 66986.37517905235, "step_time_sec": 8.231696856993949, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8074, "loss": 4.042863845825195, "lr": 0.00017640016852922918, "elapsed_sec": 66994.60555100441, "step_time_sec": 8.23023719599587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8075, "loss": 4.130919933319092, "lr": 0.0001763213323160569, "elapsed_sec": 67002.83630728722, "step_time_sec": 8.230593194981338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8076, "loss": 4.202809810638428, "lr": 0.00017624238338661075, "elapsed_sec": 67011.06816482544, "step_time_sec": 8.231672858993988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8077, "loss": 4.062798023223877, "lr": 0.00017616332186566174, "elapsed_sec": 67019.29870700836, "step_time_sec": 8.230409519019304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8078, "loss": 4.145578861236572, "lr": 0.00017608414787815874, "elapsed_sec": 67027.53006029129, "step_time_sec": 8.23116155702155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8079, "loss": 4.195059299468994, "lr": 0.00017600486154922837, "elapsed_sec": 67035.7613492012, "step_time_sec": 8.231132598011754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8080, "loss": 4.090424060821533, "lr": 0.00017592546300417475, "elapsed_sec": 67043.99306058884, "step_time_sec": 8.231622259016149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8081, "loss": 3.9833688735961914, "lr": 0.00017584595236847946, "elapsed_sec": 67052.22490859032, "step_time_sec": 8.231603830994572, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8082, "loss": 4.251234531402588, "lr": 0.00017576632976780106, "elapsed_sec": 67060.45601963997, "step_time_sec": 8.230969655007357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8083, "loss": 4.149438381195068, "lr": 0.0001756865953279752, "elapsed_sec": 67068.6875731945, "step_time_sec": 8.231356809003046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8084, "loss": 4.042508125305176, "lr": 0.00017560674917501423, "elapsed_sec": 67076.91862845421, "step_time_sec": 8.230919335008366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8085, "loss": 4.20265007019043, "lr": 0.00017552679143510708, "elapsed_sec": 67085.149777174, "step_time_sec": 8.230968945019413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8086, "loss": 4.162759780883789, "lr": 0.00017544672223461895, "elapsed_sec": 67093.44727993011, "step_time_sec": 8.238405320997117, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8087, "loss": 4.254945278167725, "lr": 0.00017536654170009132, "elapsed_sec": 67101.6784632206, "step_time_sec": 8.230942922993563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8088, "loss": 4.112188816070557, "lr": 0.0001752862499582415, "elapsed_sec": 67109.90932154655, "step_time_sec": 8.23071102402173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8089, "loss": 4.140296936035156, "lr": 0.00017520584713596262, "elapsed_sec": 67118.14004325867, "step_time_sec": 8.230571091000456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8090, "loss": 4.1698808670043945, "lr": 0.00017512533336032336, "elapsed_sec": 67126.37105154991, "step_time_sec": 8.230841178999981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8091, "loss": 4.201804161071777, "lr": 0.0001750447087585677, "elapsed_sec": 67134.60284471512, "step_time_sec": 8.231706991995452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8092, "loss": 4.064421653747559, "lr": 0.00017496397345811483, "elapsed_sec": 67142.83402109146, "step_time_sec": 8.231026797991944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8093, "loss": 4.314642906188965, "lr": 0.00017488312758655886, "elapsed_sec": 67151.06520986557, "step_time_sec": 8.230985044006957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8094, "loss": 4.181114673614502, "lr": 0.00017480217127166867, "elapsed_sec": 67159.2958741188, "step_time_sec": 8.230511305009713, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8095, "loss": 4.057234287261963, "lr": 0.00017472110464138762, "elapsed_sec": 67167.52823090553, "step_time_sec": 8.232179385988275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8096, "loss": 4.124692916870117, "lr": 0.0001746399278238335, "elapsed_sec": 67175.75833153725, "step_time_sec": 8.22991700199782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8097, "loss": 4.136149883270264, "lr": 0.0001745586409472982, "elapsed_sec": 67183.98821616173, "step_time_sec": 8.229740100010531, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8098, "loss": 4.190131187438965, "lr": 0.00017447724414024752, "elapsed_sec": 67192.21911168098, "step_time_sec": 8.230714239994995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8099, "loss": 4.022372722625732, "lr": 0.00017439573753132102, "elapsed_sec": 67200.44799518585, "step_time_sec": 8.228740135004045, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8100, "loss": 3.9286975860595703, "lr": 0.00017431412124933182, "elapsed_sec": 67208.67647528648, "step_time_sec": 8.228344116010703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8101, "loss": 4.039257526397705, "lr": 0.0001742323954232663, "elapsed_sec": 67216.90778064728, "step_time_sec": 8.23115692898864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8102, "loss": 4.245100498199463, "lr": 0.00017415056018228405, "elapsed_sec": 67225.13926124573, "step_time_sec": 8.231332718016347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8103, "loss": 4.179373264312744, "lr": 0.00017406861565571745, "elapsed_sec": 67233.36972737312, "step_time_sec": 8.230301325995242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8104, "loss": 4.158207416534424, "lr": 0.00017398656197307178, "elapsed_sec": 67241.60058188438, "step_time_sec": 8.23065776398289, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8105, "loss": 4.084346294403076, "lr": 0.00017390439926402465, "elapsed_sec": 67249.83346796036, "step_time_sec": 8.232768901012605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8106, "loss": 4.092166423797607, "lr": 0.00017382212765842608, "elapsed_sec": 67258.06194233894, "step_time_sec": 8.228263735974906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8107, "loss": 4.217005729675293, "lr": 0.00017373974728629817, "elapsed_sec": 67266.2896118164, "step_time_sec": 8.227515301987296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8108, "loss": 4.190155982971191, "lr": 0.00017365725827783485, "elapsed_sec": 67274.51863241196, "step_time_sec": 8.22887204401195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8109, "loss": 4.146851062774658, "lr": 0.00017357466076340185, "elapsed_sec": 67282.74829435349, "step_time_sec": 8.229516983003123, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8110, "loss": 4.058335781097412, "lr": 0.00017349195487353637, "elapsed_sec": 67290.97683906555, "step_time_sec": 8.228397848986788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8111, "loss": 3.9775562286376953, "lr": 0.00017340914073894675, "elapsed_sec": 67299.20739102364, "step_time_sec": 8.230391635996057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8112, "loss": 4.092794418334961, "lr": 0.00017332621849051253, "elapsed_sec": 67307.43832063675, "step_time_sec": 8.230757234996418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8113, "loss": 4.1413984298706055, "lr": 0.00017324318825928412, "elapsed_sec": 67315.66981959343, "step_time_sec": 8.2313456920092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8114, "loss": 3.89070463180542, "lr": 0.00017316005017648252, "elapsed_sec": 67323.90147972107, "step_time_sec": 8.231545972026652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8115, "loss": 4.111894607543945, "lr": 0.00017307680437349913, "elapsed_sec": 67332.1332013607, "step_time_sec": 8.231518558983225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8116, "loss": 4.155488967895508, "lr": 0.0001729934509818958, "elapsed_sec": 67340.36419773102, "step_time_sec": 8.230838415998733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8117, "loss": 4.095334053039551, "lr": 0.00017290999013340415, "elapsed_sec": 67348.59467029572, "step_time_sec": 8.230346541997278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8118, "loss": 4.192036151885986, "lr": 0.00017282642195992582, "elapsed_sec": 67356.82278323174, "step_time_sec": 8.227934422990074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8119, "loss": 4.139297008514404, "lr": 0.000172742746593532, "elapsed_sec": 67365.05395102501, "step_time_sec": 8.23106515701511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8120, "loss": 3.994550943374634, "lr": 0.00017265896416646326, "elapsed_sec": 67373.28502607346, "step_time_sec": 8.230859967996366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8121, "loss": 4.258646488189697, "lr": 0.0001725750748111294, "elapsed_sec": 67381.51731538773, "step_time_sec": 8.232129591982812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8122, "loss": 4.125983715057373, "lr": 0.00017249107866010923, "elapsed_sec": 67389.74757289886, "step_time_sec": 8.230094469006872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8123, "loss": 4.205964088439941, "lr": 0.00017240697584615028, "elapsed_sec": 67397.97842359543, "step_time_sec": 8.230740719998721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8124, "loss": 4.137531280517578, "lr": 0.00017232276650216868, "elapsed_sec": 67406.20950460434, "step_time_sec": 8.230885997996666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8125, "loss": 4.1650919914245605, "lr": 0.00017223845076124898, "elapsed_sec": 67414.43784213066, "step_time_sec": 8.228165020991582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8126, "loss": 4.143566608428955, "lr": 0.0001721540287566438, "elapsed_sec": 67422.66659975052, "step_time_sec": 8.22861028299667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8127, "loss": 4.076248645782471, "lr": 0.00017206950062177365, "elapsed_sec": 67430.89704656601, "step_time_sec": 8.230311035003979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8128, "loss": 4.108814239501953, "lr": 0.00017198486649022695, "elapsed_sec": 67439.12882375717, "step_time_sec": 8.231641769001726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8129, "loss": 4.200916767120361, "lr": 0.00017190012649575948, "elapsed_sec": 67447.35945534706, "step_time_sec": 8.23044798898627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8130, "loss": 4.189251899719238, "lr": 0.00017181528077229435, "elapsed_sec": 67455.5908062458, "step_time_sec": 8.231261562003056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8131, "loss": 4.179879188537598, "lr": 0.00017173032945392186, "elapsed_sec": 67463.82211494446, "step_time_sec": 8.231128565996187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8132, "loss": 3.9855246543884277, "lr": 0.00017164527267489907, "elapsed_sec": 67472.05238866806, "step_time_sec": 8.230085482005961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8133, "loss": 4.162336826324463, "lr": 0.00017156011056964975, "elapsed_sec": 67480.28153014183, "step_time_sec": 8.229031194001436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8134, "loss": 4.062328815460205, "lr": 0.00017147484327276416, "elapsed_sec": 67488.51238203049, "step_time_sec": 8.230673420999665, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8135, "loss": 4.027810573577881, "lr": 0.00017138947091899882, "elapsed_sec": 67496.74352097511, "step_time_sec": 8.230937311018351, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8136, "loss": 4.1177191734313965, "lr": 0.00017130399364327618, "elapsed_sec": 67504.97420573235, "step_time_sec": 8.230532132991357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8137, "loss": 4.213068962097168, "lr": 0.00017121841158068463, "elapsed_sec": 67513.2046687603, "step_time_sec": 8.230310400016606, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8138, "loss": 4.265487194061279, "lr": 0.00017113272486647804, "elapsed_sec": 67521.43621373177, "step_time_sec": 8.231396354996832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8139, "loss": 3.9536893367767334, "lr": 0.00017104693363607581, "elapsed_sec": 67529.66621255875, "step_time_sec": 8.229830621014116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8140, "loss": 4.016124725341797, "lr": 0.00017096103802506242, "elapsed_sec": 67537.89829277992, "step_time_sec": 8.231931989983423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8141, "loss": 4.106963634490967, "lr": 0.00017087503816918734, "elapsed_sec": 67546.12957954407, "step_time_sec": 8.231193247978808, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8142, "loss": 4.158844470977783, "lr": 0.00017078893420436483, "elapsed_sec": 67554.35964512825, "step_time_sec": 8.229842589993495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8143, "loss": 4.159726142883301, "lr": 0.0001707027262666736, "elapsed_sec": 67562.58792638779, "step_time_sec": 8.228117530001327, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8144, "loss": 4.146900177001953, "lr": 0.00017061641449235673, "elapsed_sec": 67570.81613373756, "step_time_sec": 8.228089426993392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8145, "loss": 4.187472343444824, "lr": 0.0001705299990178214, "elapsed_sec": 67579.04404091835, "step_time_sec": 8.227755027997773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8146, "loss": 4.058978080749512, "lr": 0.00017044347997963863, "elapsed_sec": 67587.27353262901, "step_time_sec": 8.229332290007733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8147, "loss": 4.01951789855957, "lr": 0.0001703568575145433, "elapsed_sec": 67595.50287175179, "step_time_sec": 8.229198620974785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8148, "loss": 4.145716190338135, "lr": 0.00017027013175943345, "elapsed_sec": 67603.7331366539, "step_time_sec": 8.23010956600774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8149, "loss": 4.1521382331848145, "lr": 0.00017018330285137058, "elapsed_sec": 67611.96316504478, "step_time_sec": 8.229931656009285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8150, "loss": 4.090482234954834, "lr": 0.00017009637092757913, "elapsed_sec": 67620.19463348389, "step_time_sec": 8.231247326999437, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8151, "loss": 4.136586666107178, "lr": 0.00017000933612544637, "elapsed_sec": 67628.42497324944, "step_time_sec": 8.230215451010736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8152, "loss": 4.2710652351379395, "lr": 0.00016992219858252212, "elapsed_sec": 67636.65450930595, "step_time_sec": 8.22937047801679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8153, "loss": 4.138609409332275, "lr": 0.00016983495843651865, "elapsed_sec": 67644.88620948792, "step_time_sec": 8.231540488981409, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8154, "loss": 4.057893753051758, "lr": 0.00016974761582531033, "elapsed_sec": 67653.11763644218, "step_time_sec": 8.231201532995328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8155, "loss": 4.2039642333984375, "lr": 0.00016966017088693343, "elapsed_sec": 67661.347489357, "step_time_sec": 8.229705675999867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8156, "loss": 4.172255992889404, "lr": 0.00016957262375958602, "elapsed_sec": 67669.57857108116, "step_time_sec": 8.230970286997035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8157, "loss": 4.159053325653076, "lr": 0.0001694849745816276, "elapsed_sec": 67677.8073644638, "step_time_sec": 8.228559231996769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8158, "loss": 4.069680690765381, "lr": 0.000169397223491579, "elapsed_sec": 67686.03831481934, "step_time_sec": 8.230885396013036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8159, "loss": 4.290521621704102, "lr": 0.0001693093706281221, "elapsed_sec": 67694.27017116547, "step_time_sec": 8.23170195898274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8160, "loss": 4.261802673339844, "lr": 0.00016922141613009962, "elapsed_sec": 67702.50109553337, "step_time_sec": 8.230679383006645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8161, "loss": 4.113680362701416, "lr": 0.00016913336013651484, "elapsed_sec": 67710.73161792755, "step_time_sec": 8.230435559991747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8162, "loss": 4.002089977264404, "lr": 0.00016904520278653156, "elapsed_sec": 67718.96318984032, "step_time_sec": 8.231410970998695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8163, "loss": 4.163859844207764, "lr": 0.00016895694421947369, "elapsed_sec": 67727.19442105293, "step_time_sec": 8.231079716992099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8164, "loss": 4.110896110534668, "lr": 0.0001688685845748251, "elapsed_sec": 67735.42611145973, "step_time_sec": 8.231518464017427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8165, "loss": 4.153928279876709, "lr": 0.00016878012399222945, "elapsed_sec": 67743.65637636185, "step_time_sec": 8.230061593989376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8166, "loss": 4.052203178405762, "lr": 0.00016869156261148992, "elapsed_sec": 67751.88751983643, "step_time_sec": 8.230965605005622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8167, "loss": 4.097213268280029, "lr": 0.00016860290057256887, "elapsed_sec": 67760.11796116829, "step_time_sec": 8.230290325009264, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8168, "loss": 4.220771789550781, "lr": 0.00016851413801558792, "elapsed_sec": 67768.34862875938, "step_time_sec": 8.230529833992478, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8169, "loss": 4.0430803298950195, "lr": 0.00016842527508082743, "elapsed_sec": 67776.57960891724, "step_time_sec": 8.230827546998626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8170, "loss": 4.225661754608154, "lr": 0.0001683363119087265, "elapsed_sec": 67784.8104724884, "step_time_sec": 8.230734659999143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8171, "loss": 4.091677665710449, "lr": 0.00016824724863988245, "elapsed_sec": 67793.04053664207, "step_time_sec": 8.229879383987281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8172, "loss": 4.159618377685547, "lr": 0.00016815808541505105, "elapsed_sec": 67801.27103805542, "step_time_sec": 8.230375815997832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8173, "loss": 4.0963897705078125, "lr": 0.0001680688223751458, "elapsed_sec": 67809.49970555305, "step_time_sec": 8.22849748202134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8174, "loss": 4.186812877655029, "lr": 0.00016797945966123815, "elapsed_sec": 67817.73011636734, "step_time_sec": 8.230241815006593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8175, "loss": 4.268120765686035, "lr": 0.00016788999741455693, "elapsed_sec": 67825.96099758148, "step_time_sec": 8.230735394987278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8176, "loss": 4.186574459075928, "lr": 0.00016780043577648834, "elapsed_sec": 67834.1901948452, "step_time_sec": 8.229102627985412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8177, "loss": 4.142487049102783, "lr": 0.00016771077488857562, "elapsed_sec": 67842.4181432724, "step_time_sec": 8.227696426998591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8178, "loss": 4.118481636047363, "lr": 0.00016762101489251895, "elapsed_sec": 67850.64772796631, "step_time_sec": 8.229497950000223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8179, "loss": 4.072386264801025, "lr": 0.00016753115593017503, "elapsed_sec": 67858.87804293633, "step_time_sec": 8.23014380299719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8180, "loss": 4.022984504699707, "lr": 0.000167441198143557, "elapsed_sec": 67867.10825419426, "step_time_sec": 8.230025955999736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8181, "loss": 3.9239237308502197, "lr": 0.00016735114167483424, "elapsed_sec": 67875.33813929558, "step_time_sec": 8.22978508900269, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8182, "loss": 4.006577014923096, "lr": 0.00016726098666633202, "elapsed_sec": 67883.56704854965, "step_time_sec": 8.228698496997822, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8183, "loss": 4.229240417480469, "lr": 0.00016717073326053135, "elapsed_sec": 67891.79674172401, "step_time_sec": 8.229529047996039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8184, "loss": 4.076436519622803, "lr": 0.00016708038160006882, "elapsed_sec": 67900.02749824524, "step_time_sec": 8.230596345005324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8185, "loss": 4.12562894821167, "lr": 0.0001669899318277362, "elapsed_sec": 67908.25869441032, "step_time_sec": 8.231086710002273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8186, "loss": 4.0791015625, "lr": 0.00016689938408648037, "elapsed_sec": 67916.48784089088, "step_time_sec": 8.228991051000776, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8187, "loss": 4.012454986572266, "lr": 0.000166808738519403, "elapsed_sec": 67924.71624732018, "step_time_sec": 8.228240643977188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8188, "loss": 3.9097580909729004, "lr": 0.00016671799526976044, "elapsed_sec": 67932.94463610649, "step_time_sec": 8.228189526998904, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8189, "loss": 4.041913986206055, "lr": 0.0001666271544809634, "elapsed_sec": 67941.17549681664, "step_time_sec": 8.230719625978963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8190, "loss": 3.9053757190704346, "lr": 0.00016653621629657666, "elapsed_sec": 67949.40598964691, "step_time_sec": 8.230320095986826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8191, "loss": 4.081328392028809, "lr": 0.00016644518086031895, "elapsed_sec": 67957.63413357735, "step_time_sec": 8.227986843994586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8192, "loss": 3.9336588382720947, "lr": 0.00016635404831606278, "elapsed_sec": 67965.86140108109, "step_time_sec": 8.22715448201052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8193, "loss": 4.13126277923584, "lr": 0.0001662628188078341, "elapsed_sec": 67974.090644598, "step_time_sec": 8.22907320401282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8194, "loss": 4.155962944030762, "lr": 0.00016617149247981198, "elapsed_sec": 67982.32114768028, "step_time_sec": 8.230315048014745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8195, "loss": 4.221015930175781, "lr": 0.0001660800694763287, "elapsed_sec": 67990.5524930954, "step_time_sec": 8.231211934005842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8196, "loss": 4.199339866638184, "lr": 0.00016598854994186917, "elapsed_sec": 67998.782828331, "step_time_sec": 8.230163757019909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8197, "loss": 4.269638538360596, "lr": 0.00016589693402107095, "elapsed_sec": 68007.01277542114, "step_time_sec": 8.229782130016247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8198, "loss": 4.1398115158081055, "lr": 0.00016580522185872385, "elapsed_sec": 68015.24402427673, "step_time_sec": 8.231105898012174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8199, "loss": 4.0221028327941895, "lr": 0.0001657134135997698, "elapsed_sec": 68023.47506785393, "step_time_sec": 8.230882060015574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8200, "loss": 4.071999549865723, "lr": 0.0001656215093893027, "elapsed_sec": 68031.70666408539, "step_time_sec": 8.231422103010118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8201, "loss": 3.9782888889312744, "lr": 0.000165529509372568, "elapsed_sec": 68039.93748664856, "step_time_sec": 8.230708727001911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8202, "loss": 4.099155426025391, "lr": 0.00016543741369496256, "elapsed_sec": 68048.16864323616, "step_time_sec": 8.230951076984638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8203, "loss": 3.951138734817505, "lr": 0.00016534522250203444, "elapsed_sec": 68056.40058612823, "step_time_sec": 8.231785246025538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8204, "loss": 3.9642722606658936, "lr": 0.0001652529359394827, "elapsed_sec": 68064.63236379623, "step_time_sec": 8.231640504003735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8205, "loss": 3.933060646057129, "lr": 0.00016516055415315693, "elapsed_sec": 68072.86233615875, "step_time_sec": 8.229806354996981, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8206, "loss": 4.174562931060791, "lr": 0.00016506807728905755, "elapsed_sec": 68081.09323453903, "step_time_sec": 8.2307929660019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8207, "loss": 4.0509819984436035, "lr": 0.0001649755054933349, "elapsed_sec": 68089.32360744476, "step_time_sec": 8.230215534975287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8208, "loss": 4.112175464630127, "lr": 0.00016488283891228955, "elapsed_sec": 68097.5538239479, "step_time_sec": 8.230019061011262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8209, "loss": 4.169012069702148, "lr": 0.00016479007769237188, "elapsed_sec": 68105.78459143639, "step_time_sec": 8.230643868999323, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8210, "loss": 4.121086120605469, "lr": 0.00016469722198018166, "elapsed_sec": 68114.01465845108, "step_time_sec": 8.229944846010767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8211, "loss": 4.001013278961182, "lr": 0.00016460427192246818, "elapsed_sec": 68122.24590182304, "step_time_sec": 8.231025784014491, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8212, "loss": 4.166910171508789, "lr": 0.00016451122766612976, "elapsed_sec": 68130.47614908218, "step_time_sec": 8.230098795000231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8213, "loss": 4.145337104797363, "lr": 0.0001644180893582136, "elapsed_sec": 68138.7067015171, "step_time_sec": 8.230454422009643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8214, "loss": 3.899897813796997, "lr": 0.00016432485714591548, "elapsed_sec": 68146.93934631348, "step_time_sec": 8.23245213800692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8215, "loss": 4.137608528137207, "lr": 0.0001642315311765797, "elapsed_sec": 68155.17054367065, "step_time_sec": 8.231043823005166, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8216, "loss": 4.096497058868408, "lr": 0.00016413811159769864, "elapsed_sec": 68163.40266990662, "step_time_sec": 8.232008722989121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8217, "loss": 4.093592643737793, "lr": 0.00016404459855691268, "elapsed_sec": 68171.63386559486, "step_time_sec": 8.231021784013137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8218, "loss": 4.055642127990723, "lr": 0.0001639509922020099, "elapsed_sec": 68179.86556267738, "step_time_sec": 8.231505181000102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8219, "loss": 4.196927070617676, "lr": 0.00016385729268092577, "elapsed_sec": 68188.09612989426, "step_time_sec": 8.230370740988292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8220, "loss": 4.095883846282959, "lr": 0.0001637635001417431, "elapsed_sec": 68196.32700610161, "step_time_sec": 8.23073044299963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8221, "loss": 4.086783409118652, "lr": 0.00016366961473269172, "elapsed_sec": 68204.55842542648, "step_time_sec": 8.231229070021072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8222, "loss": 3.9602646827697754, "lr": 0.00016357563660214812, "elapsed_sec": 68212.78962898254, "step_time_sec": 8.231116818002192, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8223, "loss": 4.078238487243652, "lr": 0.0001634815658986354, "elapsed_sec": 68221.0204372406, "step_time_sec": 8.230610151978908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8224, "loss": 4.079604625701904, "lr": 0.00016338740277082293, "elapsed_sec": 68229.25092029572, "step_time_sec": 8.230329973011976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8225, "loss": 3.787959098815918, "lr": 0.00016329314736752626, "elapsed_sec": 68237.48340797424, "step_time_sec": 8.232276886992622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8226, "loss": 4.068479537963867, "lr": 0.0001631987998377066, "elapsed_sec": 68245.71357107162, "step_time_sec": 8.23001772200223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8227, "loss": 4.08174467086792, "lr": 0.00016310436033047085, "elapsed_sec": 68253.94407224655, "step_time_sec": 8.230338311986998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8228, "loss": 4.10393762588501, "lr": 0.00016300982899507133, "elapsed_sec": 68262.17454457283, "step_time_sec": 8.230292605992872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8229, "loss": 3.9672842025756836, "lr": 0.0001629152059809053, "elapsed_sec": 68270.40370106697, "step_time_sec": 8.229038727993611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8230, "loss": 4.117644786834717, "lr": 0.0001628204914375151, "elapsed_sec": 68278.63196635246, "step_time_sec": 8.228060557012213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8231, "loss": 4.163902282714844, "lr": 0.00016272568551458765, "elapsed_sec": 68286.86238408089, "step_time_sec": 8.23030224701506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8232, "loss": 3.988149642944336, "lr": 0.00016263078836195428, "elapsed_sec": 68295.09284877777, "step_time_sec": 8.230356760992436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8233, "loss": 4.006316661834717, "lr": 0.00016253580012959046, "elapsed_sec": 68303.32224726677, "step_time_sec": 8.229181465983856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8234, "loss": 4.122753620147705, "lr": 0.00016244072096761568, "elapsed_sec": 68311.55167675018, "step_time_sec": 8.229283248016145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8235, "loss": 4.1180524826049805, "lr": 0.0001623455510262931, "elapsed_sec": 68319.7811665535, "step_time_sec": 8.229324600019027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8236, "loss": 4.023460865020752, "lr": 0.00016225029045602939, "elapsed_sec": 68328.01211094856, "step_time_sec": 8.230759375990601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8237, "loss": 4.150411605834961, "lr": 0.0001621549394073743, "elapsed_sec": 68336.24329161644, "step_time_sec": 8.231082490005065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8238, "loss": 4.1273579597473145, "lr": 0.00016205949803102076, "elapsed_sec": 68344.47477126122, "step_time_sec": 8.231321967003169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8239, "loss": 4.127530097961426, "lr": 0.0001619639664778044, "elapsed_sec": 68352.70527338982, "step_time_sec": 8.230270472995471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8240, "loss": 3.9820287227630615, "lr": 0.00016186834489870328, "elapsed_sec": 68360.93633699417, "step_time_sec": 8.230942625988973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8241, "loss": 4.031822681427002, "lr": 0.0001617726334448378, "elapsed_sec": 68369.16667175293, "step_time_sec": 8.230166779016145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8242, "loss": 4.004646301269531, "lr": 0.00016167683226747046, "elapsed_sec": 68377.39753103256, "step_time_sec": 8.230687739007408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8243, "loss": 4.127984046936035, "lr": 0.00016158094151800547, "elapsed_sec": 68385.62798595428, "step_time_sec": 8.23033948597731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8244, "loss": 4.041343688964844, "lr": 0.00016148496134798853, "elapsed_sec": 68393.85991215706, "step_time_sec": 8.231722595024621, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8245, "loss": 4.12069845199585, "lr": 0.0001613888919091069, "elapsed_sec": 68402.09116148949, "step_time_sec": 8.231090596003924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8246, "loss": 3.9748082160949707, "lr": 0.00016129273335318864, "elapsed_sec": 68410.32149028778, "step_time_sec": 8.230171697010519, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8247, "loss": 4.024083137512207, "lr": 0.0001611964858322029, "elapsed_sec": 68418.55207681656, "step_time_sec": 8.230434701981721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8248, "loss": 4.0033488273620605, "lr": 0.00016110014949825924, "elapsed_sec": 68426.78304934502, "step_time_sec": 8.230804465012625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8249, "loss": 4.023813724517822, "lr": 0.0001610037245036077, "elapsed_sec": 68435.01410102844, "step_time_sec": 8.230875146982726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8250, "loss": 4.169163703918457, "lr": 0.00016090721100063839, "elapsed_sec": 68443.24533987045, "step_time_sec": 8.231106024002656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8251, "loss": 4.295607089996338, "lr": 0.00016081060914188125, "elapsed_sec": 68451.47656655312, "step_time_sec": 8.231103727011941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8252, "loss": 4.103723049163818, "lr": 0.00016071391908000597, "elapsed_sec": 68459.70625305176, "step_time_sec": 8.229502660979051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8253, "loss": 4.211152076721191, "lr": 0.0001606171409678216, "elapsed_sec": 68467.9363861084, "step_time_sec": 8.229969456006074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8254, "loss": 4.168231010437012, "lr": 0.00016052027495827627, "elapsed_sec": 68476.16798949242, "step_time_sec": 8.23147986998083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8255, "loss": 4.019128322601318, "lr": 0.00016042332120445708, "elapsed_sec": 68484.3982861042, "step_time_sec": 8.230150441988371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8256, "loss": 4.262388706207275, "lr": 0.00016032627985958984, "elapsed_sec": 68492.62944102287, "step_time_sec": 8.230947981006466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8257, "loss": 4.274378299713135, "lr": 0.0001602291510770387, "elapsed_sec": 68500.8600051403, "step_time_sec": 8.230465544998879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8258, "loss": 4.180390357971191, "lr": 0.00016013193501030607, "elapsed_sec": 68509.09087395668, "step_time_sec": 8.230653423001058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8259, "loss": 4.305572509765625, "lr": 0.00016003463181303224, "elapsed_sec": 68517.3190600872, "step_time_sec": 8.228065333008999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8260, "loss": 4.210272789001465, "lr": 0.0001599372416389953, "elapsed_sec": 68525.5478553772, "step_time_sec": 8.228590762999374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8261, "loss": 4.103196144104004, "lr": 0.0001598397646421107, "elapsed_sec": 68533.77706575394, "step_time_sec": 8.229053680988727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8262, "loss": 4.153153896331787, "lr": 0.00015974220097643112, "elapsed_sec": 68542.0081949234, "step_time_sec": 8.230938838009024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8263, "loss": 4.147850513458252, "lr": 0.00015964455079614623, "elapsed_sec": 68550.23987746239, "step_time_sec": 8.231518976012012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8264, "loss": 4.270009517669678, "lr": 0.0001595468142555825, "elapsed_sec": 68558.47085571289, "step_time_sec": 8.230844297999283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8265, "loss": 4.171886920928955, "lr": 0.00015944899150920275, "elapsed_sec": 68566.70114135742, "step_time_sec": 8.230218632990727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8266, "loss": 4.096487522125244, "lr": 0.00015935108271160613, "elapsed_sec": 68574.9341878891, "step_time_sec": 8.232872504013358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8267, "loss": 4.072835445404053, "lr": 0.00015925308801752774, "elapsed_sec": 68583.16398906708, "step_time_sec": 8.229549125971971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8268, "loss": 4.019716262817383, "lr": 0.00015915500758183847, "elapsed_sec": 68591.39186501503, "step_time_sec": 8.227736261993414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8269, "loss": 4.129780292510986, "lr": 0.00015905684155954466, "elapsed_sec": 68599.62047171593, "step_time_sec": 8.22845966098248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8270, "loss": 4.288735389709473, "lr": 0.000158958590105788, "elapsed_sec": 68607.8516664505, "step_time_sec": 8.230989222996868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8271, "loss": 4.2696123123168945, "lr": 0.0001588602533758451, "elapsed_sec": 68616.08280420303, "step_time_sec": 8.230964133021189, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8272, "loss": 4.023375034332275, "lr": 0.00015876183152512742, "elapsed_sec": 68624.3131518364, "step_time_sec": 8.230224372004159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8273, "loss": 4.0406293869018555, "lr": 0.00015866332470918087, "elapsed_sec": 68632.5427865982, "step_time_sec": 8.2294451300113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8274, "loss": 4.2026238441467285, "lr": 0.00015856473308368572, "elapsed_sec": 68640.77193045616, "step_time_sec": 8.2290100120008, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8275, "loss": 4.0119099617004395, "lr": 0.00015846605680445617, "elapsed_sec": 68649.00302171707, "step_time_sec": 8.230957761988975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8276, "loss": 4.261819362640381, "lr": 0.0001583672960274403, "elapsed_sec": 68657.23323702812, "step_time_sec": 8.230049427016638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8277, "loss": 4.112046718597412, "lr": 0.00015826845090871971, "elapsed_sec": 68665.46412086487, "step_time_sec": 8.230664870992769, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8278, "loss": 4.289106369018555, "lr": 0.00015816952160450928, "elapsed_sec": 68673.69493150711, "step_time_sec": 8.230643391987542, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8279, "loss": 4.216383457183838, "lr": 0.00015807050827115692, "elapsed_sec": 68681.92573666573, "step_time_sec": 8.23069072101498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8280, "loss": 4.121379375457764, "lr": 0.00015797141106514337, "elapsed_sec": 68690.15657281876, "step_time_sec": 8.230624284013174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8281, "loss": 4.149533748626709, "lr": 0.00015787223014308193, "elapsed_sec": 68698.38737177849, "step_time_sec": 8.230688034993364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8282, "loss": 4.080966472625732, "lr": 0.00015777296566171813, "elapsed_sec": 68706.61808872223, "step_time_sec": 8.230556129012257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8283, "loss": 4.287896633148193, "lr": 0.0001576736177779297, "elapsed_sec": 68714.84882616997, "step_time_sec": 8.230539602984209, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8284, "loss": 4.189948081970215, "lr": 0.00015757418664872597, "elapsed_sec": 68723.0799484253, "step_time_sec": 8.230916365981102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8285, "loss": 3.995246171951294, "lr": 0.00015747467243124806, "elapsed_sec": 68731.31033730507, "step_time_sec": 8.230242944991915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8286, "loss": 4.154514312744141, "lr": 0.00015737507528276824, "elapsed_sec": 68739.54111337662, "step_time_sec": 8.230659690016182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8287, "loss": 3.970848321914673, "lr": 0.00015727539536068995, "elapsed_sec": 68747.7718732357, "step_time_sec": 8.230609188991366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8288, "loss": 4.08168888092041, "lr": 0.00015717563282254728, "elapsed_sec": 68756.00168943405, "step_time_sec": 8.229573349992279, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8289, "loss": 3.994162082672119, "lr": 0.00015707578782600515, "elapsed_sec": 68764.23148465157, "step_time_sec": 8.22965603499324, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8290, "loss": 4.020946502685547, "lr": 0.00015697586052885855, "elapsed_sec": 68772.46143341064, "step_time_sec": 8.229745655989973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8291, "loss": 4.139753818511963, "lr": 0.0001568758510890327, "elapsed_sec": 68780.69231390953, "step_time_sec": 8.230747249996057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8292, "loss": 4.033897399902344, "lr": 0.00015677575966458252, "elapsed_sec": 68788.92300224304, "step_time_sec": 8.230498599994462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8293, "loss": 4.068337440490723, "lr": 0.00015667558641369256, "elapsed_sec": 68797.15318489075, "step_time_sec": 8.230023576994427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8294, "loss": 3.9719080924987793, "lr": 0.0001565753314946767, "elapsed_sec": 68805.38337111473, "step_time_sec": 8.230050705024041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8295, "loss": 4.162044048309326, "lr": 0.0001564749950659779, "elapsed_sec": 68813.61395335197, "step_time_sec": 8.230406748974929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8296, "loss": 3.9566891193389893, "lr": 0.00015637457728616785, "elapsed_sec": 68821.84477353096, "step_time_sec": 8.230716786987614, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8297, "loss": 3.989258289337158, "lr": 0.00015627407831394688, "elapsed_sec": 68830.07573318481, "step_time_sec": 8.23074847998214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8298, "loss": 4.266422748565674, "lr": 0.00015617349830814367, "elapsed_sec": 68838.30755734444, "step_time_sec": 8.231644396000775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8299, "loss": 4.189156532287598, "lr": 0.00015607283742771487, "elapsed_sec": 68846.53794336319, "step_time_sec": 8.230235583992908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8300, "loss": 4.117122173309326, "lr": 0.00015597209583174498, "elapsed_sec": 68854.76901769638, "step_time_sec": 8.230950403987663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8301, "loss": 4.1360297203063965, "lr": 0.00015587127367944613, "elapsed_sec": 68862.99980020523, "step_time_sec": 8.230558821989689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8302, "loss": 4.193490028381348, "lr": 0.00015577037113015765, "elapsed_sec": 68871.2320318222, "step_time_sec": 8.232090503006475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8303, "loss": 4.189589977264404, "lr": 0.00015566938834334605, "elapsed_sec": 68879.46271562576, "step_time_sec": 8.230524985992815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8304, "loss": 4.095293045043945, "lr": 0.00015556832547860454, "elapsed_sec": 68887.69310975075, "step_time_sec": 8.230228461994557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8305, "loss": 4.039289951324463, "lr": 0.00015546718269565288, "elapsed_sec": 68895.9243273735, "step_time_sec": 8.231067569984589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8306, "loss": 4.037785053253174, "lr": 0.00015536596015433728, "elapsed_sec": 68904.15447926521, "step_time_sec": 8.229995650995988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8307, "loss": 4.131558418273926, "lr": 0.00015526465801462985, "elapsed_sec": 68912.38533043861, "step_time_sec": 8.230687874980504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8308, "loss": 4.093619346618652, "lr": 0.00015516327643662857, "elapsed_sec": 68920.61657905579, "step_time_sec": 8.2311118899961, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8309, "loss": 4.108926773071289, "lr": 0.0001550618155805569, "elapsed_sec": 68928.84726834297, "step_time_sec": 8.230611488979775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8310, "loss": 3.906359910964966, "lr": 0.0001549602756067637, "elapsed_sec": 68937.07899308205, "step_time_sec": 8.231527209980413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8311, "loss": 4.073352813720703, "lr": 0.00015485865667572274, "elapsed_sec": 68945.31035733223, "step_time_sec": 8.231122950004647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8312, "loss": 4.0147705078125, "lr": 0.00015475695894803264, "elapsed_sec": 68953.54115986824, "step_time_sec": 8.23067295801593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8313, "loss": 4.1811113357543945, "lr": 0.0001546551825844166, "elapsed_sec": 68961.77218604088, "step_time_sec": 8.230876095010899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8314, "loss": 4.133883953094482, "lr": 0.00015455332774572198, "elapsed_sec": 68970.00295948982, "step_time_sec": 8.230667708005058, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8315, "loss": 4.0809645652771, "lr": 0.00015445139459292027, "elapsed_sec": 68978.23366832733, "step_time_sec": 8.230544646998169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8316, "loss": 4.187154769897461, "lr": 0.00015434938328710667, "elapsed_sec": 68986.4647552967, "step_time_sec": 8.230891162005719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8317, "loss": 4.035526275634766, "lr": 0.00015424729398949992, "elapsed_sec": 68994.6957564354, "step_time_sec": 8.23088942898903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8318, "loss": 4.079876899719238, "lr": 0.0001541451268614419, "elapsed_sec": 69002.92647266388, "step_time_sec": 8.230516274983529, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8319, "loss": 4.058657169342041, "lr": 0.00015404288206439777, "elapsed_sec": 69011.15730977058, "step_time_sec": 8.230681958026253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8320, "loss": 4.127237319946289, "lr": 0.0001539405597599551, "elapsed_sec": 69019.38719415665, "step_time_sec": 8.229746655997587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8321, "loss": 4.071511268615723, "lr": 0.00015383816010982427, "elapsed_sec": 69027.61916923523, "step_time_sec": 8.231797504005954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8322, "loss": 4.212087154388428, "lr": 0.00015373568327583763, "elapsed_sec": 69035.84973335266, "step_time_sec": 8.23048206299427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8323, "loss": 4.150247573852539, "lr": 0.00015363312941994966, "elapsed_sec": 69044.0784676075, "step_time_sec": 8.2285314979963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8324, "loss": 4.115091800689697, "lr": 0.00015353049870423655, "elapsed_sec": 69052.30616164207, "step_time_sec": 8.227579976984998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8325, "loss": 4.0471367835998535, "lr": 0.00015342779129089593, "elapsed_sec": 69060.5370452404, "step_time_sec": 8.230674350023037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8326, "loss": 4.018762588500977, "lr": 0.00015332500734224663, "elapsed_sec": 69068.76852464676, "step_time_sec": 8.231267456983915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8327, "loss": 4.175698280334473, "lr": 0.0001532221470207285, "elapsed_sec": 69076.99941301346, "step_time_sec": 8.230779385979986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8328, "loss": 4.116957664489746, "lr": 0.000153119210488902, "elapsed_sec": 69085.23138594627, "step_time_sec": 8.231839991989546, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8329, "loss": 3.9734926223754883, "lr": 0.00015301619790944817, "elapsed_sec": 69093.46235609055, "step_time_sec": 8.230744141998002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8330, "loss": 4.118539810180664, "lr": 0.00015291310944516806, "elapsed_sec": 69101.69329977036, "step_time_sec": 8.230810349981766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8331, "loss": 3.9776179790496826, "lr": 0.00015280994525898275, "elapsed_sec": 69109.92622542381, "step_time_sec": 8.23285273698275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8332, "loss": 3.9169912338256836, "lr": 0.00015270670551393304, "elapsed_sec": 69118.15853500366, "step_time_sec": 8.232109642005526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8333, "loss": 4.190698146820068, "lr": 0.00015260339037317903, "elapsed_sec": 69126.39123344421, "step_time_sec": 8.232558582007186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8334, "loss": 3.948875665664673, "lr": 0.0001525, "elapsed_sec": 69134.62390637398, "step_time_sec": 8.232476692995988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8335, "loss": 4.166153907775879, "lr": 0.00015239653455779422, "elapsed_sec": 69142.85606813431, "step_time_sec": 8.231908538989956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8336, "loss": 4.117424011230469, "lr": 0.0001522929942100785, "elapsed_sec": 69151.08662438393, "step_time_sec": 8.230401209992124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8337, "loss": 4.2527689933776855, "lr": 0.00015218937912048807, "elapsed_sec": 69159.31796836853, "step_time_sec": 8.231191650993424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8338, "loss": 4.175387859344482, "lr": 0.00015208568945277627, "elapsed_sec": 69167.54699063301, "step_time_sec": 8.228908664983464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8339, "loss": 3.967604160308838, "lr": 0.0001519819253708143, "elapsed_sec": 69175.77742505074, "step_time_sec": 8.230231172026834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8340, "loss": 4.202940940856934, "lr": 0.00015187808703859103, "elapsed_sec": 69184.00603199005, "step_time_sec": 8.22844947699923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8341, "loss": 4.148523807525635, "lr": 0.0001517741746202125, "elapsed_sec": 69192.23629045486, "step_time_sec": 8.230101075983839, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8342, "loss": 4.1839118003845215, "lr": 0.00015167018827990208, "elapsed_sec": 69200.46673583984, "step_time_sec": 8.230252429988468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8343, "loss": 3.9809677600860596, "lr": 0.00015156612818199977, "elapsed_sec": 69208.69630002975, "step_time_sec": 8.229381116980221, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8344, "loss": 4.076254367828369, "lr": 0.00015146199449096224, "elapsed_sec": 69216.92643404007, "step_time_sec": 8.229979416995775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8345, "loss": 3.9404726028442383, "lr": 0.00015135778737136244, "elapsed_sec": 69225.15643310547, "step_time_sec": 8.22987143197679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8346, "loss": 3.9062464237213135, "lr": 0.00015125350698788934, "elapsed_sec": 69233.38692998886, "step_time_sec": 8.230295774003025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8347, "loss": 4.133115768432617, "lr": 0.00015114915350534767, "elapsed_sec": 69241.61691617966, "step_time_sec": 8.229820250999182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8348, "loss": 4.00809383392334, "lr": 0.0001510447270886578, "elapsed_sec": 69249.84622573853, "step_time_sec": 8.22922558299615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8349, "loss": 4.085416793823242, "lr": 0.00015094022790285532, "elapsed_sec": 69258.0774281025, "step_time_sec": 8.230974669015268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8350, "loss": 3.962641716003418, "lr": 0.00015083565611309074, "elapsed_sec": 69266.30875468254, "step_time_sec": 8.23114901498775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8351, "loss": 4.069022178649902, "lr": 0.0001507310118846294, "elapsed_sec": 69274.53964567184, "step_time_sec": 8.23072479700204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8352, "loss": 3.9148173332214355, "lr": 0.00015062629538285108, "elapsed_sec": 69282.77060890198, "step_time_sec": 8.230784833984217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8353, "loss": 4.058850288391113, "lr": 0.0001505215067732498, "elapsed_sec": 69291.00173902512, "step_time_sec": 8.230980151012773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8354, "loss": 3.984717845916748, "lr": 0.0001504166462214336, "elapsed_sec": 69299.23070549965, "step_time_sec": 8.228840344003402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8355, "loss": 4.065704345703125, "lr": 0.00015031171389312406, "elapsed_sec": 69307.46014809608, "step_time_sec": 8.229301031009527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8356, "loss": 4.28450870513916, "lr": 0.00015020670995415637, "elapsed_sec": 69315.69005775452, "step_time_sec": 8.229786709998734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8357, "loss": 4.06116247177124, "lr": 0.00015010163457047877, "elapsed_sec": 69323.91779947281, "step_time_sec": 8.227517799998168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8358, "loss": 3.9627175331115723, "lr": 0.00014999648790815247, "elapsed_sec": 69332.14524245262, "step_time_sec": 8.227346230996773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8359, "loss": 3.9991085529327393, "lr": 0.00014989127013335127, "elapsed_sec": 69340.37708687782, "step_time_sec": 8.231695374997798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8360, "loss": 4.076084613800049, "lr": 0.0001497859814123615, "elapsed_sec": 69348.60779929161, "step_time_sec": 8.230537750991061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8361, "loss": 4.073925971984863, "lr": 0.00014968062191158144, "elapsed_sec": 69356.83726859093, "step_time_sec": 8.229303316998994, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8362, "loss": 4.111567974090576, "lr": 0.00014957519179752132, "elapsed_sec": 69365.06671214104, "step_time_sec": 8.229241415014258, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8363, "loss": 4.1499834060668945, "lr": 0.00014946969123680293, "elapsed_sec": 69373.29407668114, "step_time_sec": 8.227159353002207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8364, "loss": 4.177452087402344, "lr": 0.00014936412039615946, "elapsed_sec": 69381.52546739578, "step_time_sec": 8.231232929014368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8365, "loss": 3.9971730709075928, "lr": 0.0001492584794424351, "elapsed_sec": 69389.75595712662, "step_time_sec": 8.230330660007894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8366, "loss": 4.1594672203063965, "lr": 0.00014915276854258485, "elapsed_sec": 69397.98452758789, "step_time_sec": 8.228411534015322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8367, "loss": 3.9980247020721436, "lr": 0.00014904698786367433, "elapsed_sec": 69406.21446990967, "step_time_sec": 8.229781233007088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8368, "loss": 4.032858848571777, "lr": 0.00014894113757287937, "elapsed_sec": 69414.44431424141, "step_time_sec": 8.229705321980873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8369, "loss": 4.116787910461426, "lr": 0.0001488352178374858, "elapsed_sec": 69422.67515945435, "step_time_sec": 8.230677134008147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8370, "loss": 4.0433125495910645, "lr": 0.00014872922882488925, "elapsed_sec": 69430.90555095673, "step_time_sec": 8.230244431993924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8371, "loss": 4.082945823669434, "lr": 0.0001486231707025948, "elapsed_sec": 69439.13664746284, "step_time_sec": 8.230934636987513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8372, "loss": 4.124801158905029, "lr": 0.00014851704363821677, "elapsed_sec": 69447.36743807793, "step_time_sec": 8.2306929169863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8373, "loss": 4.128530502319336, "lr": 0.00014841084779947846, "elapsed_sec": 69455.5988240242, "step_time_sec": 8.23114590899786, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8374, "loss": 3.944819450378418, "lr": 0.00014830458335421184, "elapsed_sec": 69463.82914018631, "step_time_sec": 8.23014763998799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8375, "loss": 4.0259318351745605, "lr": 0.00014819825047035723, "elapsed_sec": 69472.0609908104, "step_time_sec": 8.23167511599604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8376, "loss": 4.309098720550537, "lr": 0.0001480918493159632, "elapsed_sec": 69480.29172945023, "step_time_sec": 8.230620516987983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8377, "loss": 4.0497589111328125, "lr": 0.00014798538005918624, "elapsed_sec": 69488.5226047039, "step_time_sec": 8.230747211986454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8378, "loss": 4.050824165344238, "lr": 0.00014787884286829043, "elapsed_sec": 69496.75456142426, "step_time_sec": 8.231824180984404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8379, "loss": 4.080826282501221, "lr": 0.00014777223791164715, "elapsed_sec": 69504.98391342163, "step_time_sec": 8.22915819499758, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8380, "loss": 4.170077800750732, "lr": 0.00014766556535773497, "elapsed_sec": 69513.21321749687, "step_time_sec": 8.229173412983073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8381, "loss": 4.287255764007568, "lr": 0.00014755882537513925, "elapsed_sec": 69521.44338965416, "step_time_sec": 8.229925696010469, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8382, "loss": 3.9837591648101807, "lr": 0.00014745201813255187, "elapsed_sec": 69529.67366743088, "step_time_sec": 8.230112555000233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8383, "loss": 4.021696090698242, "lr": 0.00014734514379877116, "elapsed_sec": 69537.9031662941, "step_time_sec": 8.229353105998598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8384, "loss": 4.076295852661133, "lr": 0.00014723820254270128, "elapsed_sec": 69546.13390493393, "step_time_sec": 8.230628749006428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8385, "loss": 4.136383056640625, "lr": 0.0001471311945333523, "elapsed_sec": 69554.36583089828, "step_time_sec": 8.231769592006458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8386, "loss": 3.8759660720825195, "lr": 0.0001470241199398397, "elapsed_sec": 69562.59616422653, "step_time_sec": 8.230160588020226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8387, "loss": 4.0484538078308105, "lr": 0.00014691697893138428, "elapsed_sec": 69570.82673335075, "step_time_sec": 8.230434316996252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8388, "loss": 4.074484825134277, "lr": 0.00014680977167731173, "elapsed_sec": 69579.05884981155, "step_time_sec": 8.231886231980752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8389, "loss": 4.061526775360107, "lr": 0.00014670249834705242, "elapsed_sec": 69587.29011392593, "step_time_sec": 8.231169854989275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8390, "loss": 4.113466262817383, "lr": 0.00014659515911014123, "elapsed_sec": 69595.52094960213, "step_time_sec": 8.230663615016965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8391, "loss": 4.018374443054199, "lr": 0.00014648775413621715, "elapsed_sec": 69603.7520661354, "step_time_sec": 8.230913876002887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8392, "loss": 4.102305889129639, "lr": 0.00014638028359502303, "elapsed_sec": 69611.98370933533, "step_time_sec": 8.23148486498394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8393, "loss": 4.1290998458862305, "lr": 0.00014627274765640537, "elapsed_sec": 69620.21465444565, "step_time_sec": 8.230837409006199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8394, "loss": 4.207062244415283, "lr": 0.00014616514649031407, "elapsed_sec": 69628.44620513916, "step_time_sec": 8.23143886300386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8395, "loss": 4.326524257659912, "lr": 0.00014605748026680202, "elapsed_sec": 69636.67610740662, "step_time_sec": 8.229669908003416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8396, "loss": 4.0695672035217285, "lr": 0.000145949749156025, "elapsed_sec": 69644.904733181, "step_time_sec": 8.228479056997458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8397, "loss": 4.181346893310547, "lr": 0.00014584195332824137, "elapsed_sec": 69653.1346757412, "step_time_sec": 8.229778577020625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8398, "loss": 4.133565425872803, "lr": 0.00014573409295381165, "elapsed_sec": 69661.36639118195, "step_time_sec": 8.231547308969311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8399, "loss": 4.153907299041748, "lr": 0.00014562616820319849, "elapsed_sec": 69669.597427845, "step_time_sec": 8.230859073984902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8400, "loss": 4.028529644012451, "lr": 0.00014551817924696617, "elapsed_sec": 69677.82640600204, "step_time_sec": 8.228822399978526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8401, "loss": 3.966414451599121, "lr": 0.0001454101262557805, "elapsed_sec": 69686.05624198914, "step_time_sec": 8.229612442024518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8402, "loss": 4.022522926330566, "lr": 0.00014530200940040852, "elapsed_sec": 69694.28485798836, "step_time_sec": 8.228475188021548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8403, "loss": 3.9255545139312744, "lr": 0.00014519382885171822, "elapsed_sec": 69702.51538562775, "step_time_sec": 8.230411414988339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8404, "loss": 4.079139232635498, "lr": 0.00014508558478067808, "elapsed_sec": 69710.74699544907, "step_time_sec": 8.231491181999445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8405, "loss": 4.051558017730713, "lr": 0.0001449772773583572, "elapsed_sec": 69718.97824382782, "step_time_sec": 8.23098624200793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8406, "loss": 3.9423062801361084, "lr": 0.00014486890675592461, "elapsed_sec": 69727.20982718468, "step_time_sec": 8.231422966986429, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8407, "loss": 4.2686767578125, "lr": 0.0001447604731446493, "elapsed_sec": 69735.44116950035, "step_time_sec": 8.231196064967662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8408, "loss": 4.133525371551514, "lr": 0.00014465197669589984, "elapsed_sec": 69743.67262458801, "step_time_sec": 8.231213813996874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8409, "loss": 4.039440631866455, "lr": 0.00014454341758114405, "elapsed_sec": 69751.90388822556, "step_time_sec": 8.231137668015435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8410, "loss": 4.101277828216553, "lr": 0.00014443479597194881, "elapsed_sec": 69760.13537979126, "step_time_sec": 8.231294427998364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8411, "loss": 4.052449703216553, "lr": 0.0001443261120399798, "elapsed_sec": 69768.366147995, "step_time_sec": 8.230682905996218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8412, "loss": 4.11983060836792, "lr": 0.00014421736595700112, "elapsed_sec": 69776.59670376778, "step_time_sec": 8.230340156995226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8413, "loss": 4.0898823738098145, "lr": 0.00014410855789487518, "elapsed_sec": 69784.82550811768, "step_time_sec": 8.228617706045043, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8414, "loss": 4.185410499572754, "lr": 0.0001439996880255623, "elapsed_sec": 69793.05618786812, "step_time_sec": 8.230513254995458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8415, "loss": 3.9276773929595947, "lr": 0.0001438907565211205, "elapsed_sec": 69801.28736233711, "step_time_sec": 8.23100005602464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8416, "loss": 4.154094219207764, "lr": 0.00014378176355370515, "elapsed_sec": 69809.51813983917, "step_time_sec": 8.230708212999161, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8417, "loss": 4.061429977416992, "lr": 0.0001436727092955688, "elapsed_sec": 69817.74828124046, "step_time_sec": 8.229928993969224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8418, "loss": 3.9526777267456055, "lr": 0.0001435635939190609, "elapsed_sec": 69825.97960639, "step_time_sec": 8.231141704018228, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8419, "loss": 4.045917510986328, "lr": 0.00014345441759662742, "elapsed_sec": 69834.21158313751, "step_time_sec": 8.23181663901778, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8420, "loss": 3.9229605197906494, "lr": 0.00014334518050081069, "elapsed_sec": 69842.44261360168, "step_time_sec": 8.230857237009332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8421, "loss": 3.9440906047821045, "lr": 0.00014323588280424908, "elapsed_sec": 69850.67212200165, "step_time_sec": 8.229410200030543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8422, "loss": 4.023775100708008, "lr": 0.00014312652467967672, "elapsed_sec": 69858.90309500694, "step_time_sec": 8.230730751994997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8423, "loss": 4.149770259857178, "lr": 0.00014301710629992327, "elapsed_sec": 69867.13461899757, "step_time_sec": 8.231419282965362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8424, "loss": 3.982006311416626, "lr": 0.0001429076278379136, "elapsed_sec": 69875.36562824249, "step_time_sec": 8.23081832297612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8425, "loss": 4.084634780883789, "lr": 0.0001427980894666675, "elapsed_sec": 69883.59703755379, "step_time_sec": 8.2311855989974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8426, "loss": 4.1751203536987305, "lr": 0.0001426884913592995, "elapsed_sec": 69891.82826638222, "step_time_sec": 8.231120190001093, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8427, "loss": 4.032606601715088, "lr": 0.0001425788336890185, "elapsed_sec": 69900.05903077126, "step_time_sec": 8.230557226983365, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8428, "loss": 4.054851531982422, "lr": 0.00014246911662912757, "elapsed_sec": 69908.28941512108, "step_time_sec": 8.230240545992274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8429, "loss": 4.070963382720947, "lr": 0.0001423593403530236, "elapsed_sec": 69916.52035236359, "step_time_sec": 8.23075816599885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8430, "loss": 4.173376083374023, "lr": 0.00014224950503419705, "elapsed_sec": 69924.75141906738, "step_time_sec": 8.230918619025033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8431, "loss": 3.990043878555298, "lr": 0.00014213961084623175, "elapsed_sec": 69932.98284173012, "step_time_sec": 8.231303056003526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8432, "loss": 4.181698799133301, "lr": 0.00014202965796280455, "elapsed_sec": 69941.21447587013, "step_time_sec": 8.231412883964367, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8433, "loss": 4.042077541351318, "lr": 0.000141919646557685, "elapsed_sec": 69949.44606471062, "step_time_sec": 8.231445006967988, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8434, "loss": 4.105112552642822, "lr": 0.00014180957680473525, "elapsed_sec": 69957.67721319199, "step_time_sec": 8.23097112798132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8435, "loss": 4.150096416473389, "lr": 0.00014169944887790955, "elapsed_sec": 69965.90784430504, "step_time_sec": 8.230467574030627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8436, "loss": 4.127730369567871, "lr": 0.00014158926295125413, "elapsed_sec": 69974.13947701454, "step_time_sec": 8.231559314008337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8437, "loss": 4.034190654754639, "lr": 0.00014147901919890691, "elapsed_sec": 69982.3702147007, "step_time_sec": 8.23051469703205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8438, "loss": 4.049023628234863, "lr": 0.00014136871779509724, "elapsed_sec": 69990.60113286972, "step_time_sec": 8.23083332797978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8439, "loss": 4.009584903717041, "lr": 0.0001412583589141454, "elapsed_sec": 69998.83233642578, "step_time_sec": 8.230989010015037, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8440, "loss": 3.876335620880127, "lr": 0.00014114794273046273, "elapsed_sec": 70007.06202292442, "step_time_sec": 8.22953748004511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8441, "loss": 4.156271934509277, "lr": 0.00014103746941855098, "elapsed_sec": 70015.29343819618, "step_time_sec": 8.231292227981612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8442, "loss": 4.1369709968566895, "lr": 0.0001409269391530022, "elapsed_sec": 70023.52394890785, "step_time_sec": 8.230290498002432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8443, "loss": 4.104703426361084, "lr": 0.0001408163521084986, "elapsed_sec": 70031.7558953762, "step_time_sec": 8.23178759298753, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8444, "loss": 3.9610755443573, "lr": 0.0001407057084598119, "elapsed_sec": 70039.98686695099, "step_time_sec": 8.230872724030633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8445, "loss": 4.092274188995361, "lr": 0.0001405950083818034, "elapsed_sec": 70048.2187321186, "step_time_sec": 8.231652415008284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8446, "loss": 4.212036609649658, "lr": 0.00014048425204942361, "elapsed_sec": 70056.44943642616, "step_time_sec": 8.230586552002933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8447, "loss": 4.364424228668213, "lr": 0.00014037343963771188, "elapsed_sec": 70064.68111205101, "step_time_sec": 8.231460294977296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8448, "loss": 4.101032257080078, "lr": 0.00014026257132179618, "elapsed_sec": 70072.91225934029, "step_time_sec": 8.231053613999393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8449, "loss": 4.158392906188965, "lr": 0.0001401516472768929, "elapsed_sec": 70081.14363765717, "step_time_sec": 8.231199815985747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8450, "loss": 4.216705322265625, "lr": 0.00014004066767830644, "elapsed_sec": 70089.37197637558, "step_time_sec": 8.228124669985846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8451, "loss": 4.08394193649292, "lr": 0.00013992963270142904, "elapsed_sec": 70097.60281658173, "step_time_sec": 8.230662957997993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8452, "loss": 4.153367519378662, "lr": 0.0001398185425217404, "elapsed_sec": 70105.83440566063, "step_time_sec": 8.231476171989925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8453, "loss": 4.093417167663574, "lr": 0.00013970739731480756, "elapsed_sec": 70114.06551337242, "step_time_sec": 8.230982531968039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8454, "loss": 4.066890716552734, "lr": 0.0001395961972562844, "elapsed_sec": 70122.29747080803, "step_time_sec": 8.231794469000306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8455, "loss": 4.119101047515869, "lr": 0.00013948494252191161, "elapsed_sec": 70130.53165364265, "step_time_sec": 8.234048124984838, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8456, "loss": 4.222465515136719, "lr": 0.0001393736332875162, "elapsed_sec": 70138.76373505592, "step_time_sec": 8.231881337007508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8457, "loss": 4.034773826599121, "lr": 0.0001392622697290114, "elapsed_sec": 70146.99614167213, "step_time_sec": 8.232196306984406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8458, "loss": 4.132722854614258, "lr": 0.00013915085202239618, "elapsed_sec": 70155.22841143608, "step_time_sec": 8.2321588639752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8459, "loss": 4.094777584075928, "lr": 0.00013903938034375517, "elapsed_sec": 70163.46019816399, "step_time_sec": 8.23162268300075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8460, "loss": 3.9427547454833984, "lr": 0.00013892785486925826, "elapsed_sec": 70171.69178843498, "step_time_sec": 8.231396980991121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8461, "loss": 4.132266998291016, "lr": 0.0001388162757751604, "elapsed_sec": 70179.92441344261, "step_time_sec": 8.232429363997653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8462, "loss": 4.037353038787842, "lr": 0.00013870464323780123, "elapsed_sec": 70188.15495705605, "step_time_sec": 8.23044109303737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8463, "loss": 4.0737433433532715, "lr": 0.0001385929574336049, "elapsed_sec": 70196.38751196861, "step_time_sec": 8.232384972972795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8464, "loss": 4.151650905609131, "lr": 0.00013848121853907971, "elapsed_sec": 70204.61916661263, "step_time_sec": 8.23141739401035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8465, "loss": 3.9840283393859863, "lr": 0.00013836942673081784, "elapsed_sec": 70212.84933757782, "step_time_sec": 8.230004540004302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8466, "loss": 4.016157627105713, "lr": 0.00013825758218549515, "elapsed_sec": 70221.08074307442, "step_time_sec": 8.231250570039265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8467, "loss": 4.0240044593811035, "lr": 0.00013814568507987087, "elapsed_sec": 70229.31102514267, "step_time_sec": 8.230130401032511, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8468, "loss": 4.005252838134766, "lr": 0.00013803373559078715, "elapsed_sec": 70237.54159283638, "step_time_sec": 8.230371660960373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8469, "loss": 4.182056903839111, "lr": 0.0001379217338951691, "elapsed_sec": 70245.76915621758, "step_time_sec": 8.227450847974978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8470, "loss": 4.069942474365234, "lr": 0.00013780968017002422, "elapsed_sec": 70253.9945397377, "step_time_sec": 8.225243292981759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8471, "loss": 4.173440933227539, "lr": 0.00013769757459244233, "elapsed_sec": 70262.22342467308, "step_time_sec": 8.22869521501707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8472, "loss": 4.091847896575928, "lr": 0.0001375854173395951, "elapsed_sec": 70270.45378732681, "step_time_sec": 8.230162953026593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8473, "loss": 4.030980110168457, "lr": 0.00013747320858873595, "elapsed_sec": 70278.6855340004, "step_time_sec": 8.231567458016798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8474, "loss": 3.9271488189697266, "lr": 0.00013736094851719965, "elapsed_sec": 70286.91644263268, "step_time_sec": 8.23076616995968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8475, "loss": 4.069836616516113, "lr": 0.00013724863730240204, "elapsed_sec": 70295.14843726158, "step_time_sec": 8.231868928996846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8476, "loss": 4.095757007598877, "lr": 0.00013713627512183986, "elapsed_sec": 70303.37961959839, "step_time_sec": 8.23104585101828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8477, "loss": 4.104904651641846, "lr": 0.00013702386215309038, "elapsed_sec": 70311.61224412918, "step_time_sec": 8.232473960029893, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8478, "loss": 4.034915924072266, "lr": 0.0001369113985738111, "elapsed_sec": 70319.84422111511, "step_time_sec": 8.231813493010122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8479, "loss": 4.172235012054443, "lr": 0.0001367988845617395, "elapsed_sec": 70328.07805490494, "step_time_sec": 8.233666763000656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8480, "loss": 3.943650484085083, "lr": 0.00013668632029469283, "elapsed_sec": 70336.30994081497, "step_time_sec": 8.231715807982255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8481, "loss": 3.9419240951538086, "lr": 0.00013657370595056767, "elapsed_sec": 70344.54152393341, "step_time_sec": 8.23141751496587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8482, "loss": 4.142129898071289, "lr": 0.00013646104170733986, "elapsed_sec": 70352.7734375, "step_time_sec": 8.231667194981128, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8483, "loss": 4.221067905426025, "lr": 0.00013634832774306397, "elapsed_sec": 70361.0033826828, "step_time_sec": 8.229857517988421, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8484, "loss": 4.033290863037109, "lr": 0.00013623556423587326, "elapsed_sec": 70369.23327636719, "step_time_sec": 8.229777105967514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8485, "loss": 4.017926216125488, "lr": 0.0001361227513639792, "elapsed_sec": 70377.46312689781, "step_time_sec": 8.229548241011798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8486, "loss": 4.0155768394470215, "lr": 0.00013600988930567137, "elapsed_sec": 70385.69363641739, "step_time_sec": 8.230404069006909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8487, "loss": 4.0315093994140625, "lr": 0.00013589697823931699, "elapsed_sec": 70393.92413377762, "step_time_sec": 8.230253451969475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8488, "loss": 4.178143501281738, "lr": 0.00013578401834336078, "elapsed_sec": 70402.15463900566, "step_time_sec": 8.230390852026176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8489, "loss": 4.280350685119629, "lr": 0.0001356710097963246, "elapsed_sec": 70410.38541460037, "step_time_sec": 8.230586049030535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8490, "loss": 3.9715516567230225, "lr": 0.00013555795277680726, "elapsed_sec": 70418.61607480049, "step_time_sec": 8.230487207998522, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8491, "loss": 4.1142802238464355, "lr": 0.00013544484746348413, "elapsed_sec": 70426.84704971313, "step_time_sec": 8.230822048033588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8492, "loss": 4.146721839904785, "lr": 0.0001353316940351069, "elapsed_sec": 70435.078343153, "step_time_sec": 8.231095598952379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8493, "loss": 3.9573018550872803, "lr": 0.00013521849267050332, "elapsed_sec": 70443.30939340591, "step_time_sec": 8.23093678301666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8494, "loss": 4.103301525115967, "lr": 0.0001351052435485769, "elapsed_sec": 70451.54033327103, "step_time_sec": 8.230759282014333, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8495, "loss": 4.057191848754883, "lr": 0.0001349919468483066, "elapsed_sec": 70459.77140641212, "step_time_sec": 8.230926219024695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8496, "loss": 4.0559844970703125, "lr": 0.00013487860274874656, "elapsed_sec": 70468.00274562836, "step_time_sec": 8.231150819046889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8497, "loss": 4.2417073249816895, "lr": 0.0001347652114290259, "elapsed_sec": 70476.23296880722, "step_time_sec": 8.230062526999973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8498, "loss": 4.262842655181885, "lr": 0.00013465177306834833, "elapsed_sec": 70484.46106290817, "step_time_sec": 8.227988389029633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8499, "loss": 4.08730936050415, "lr": 0.0001345382878459919, "elapsed_sec": 70492.69274759293, "step_time_sec": 8.231524953036569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8500, "loss": 4.152107238769531, "lr": 0.0001344247559413087, "elapsed_sec": 70500.92199921608, "step_time_sec": 30.78636245400412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8501, "loss": 4.0766448974609375, "lr": 0.00013431117753372461, "elapsed_sec": 70531.7192106247, "step_time_sec": 8.239409832982346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8502, "loss": 4.1528730392456055, "lr": 0.00013419755280273898, "elapsed_sec": 70539.93486261368, "step_time_sec": 8.215494928997941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8503, "loss": 4.007999420166016, "lr": 0.0001340838819279245, "elapsed_sec": 70548.15267777443, "step_time_sec": 8.217730054981075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8504, "loss": 3.9963345527648926, "lr": 0.0001339701650889266, "elapsed_sec": 70556.3695166111, "step_time_sec": 8.21662781498162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8505, "loss": 4.112575054168701, "lr": 0.00013385640246546348, "elapsed_sec": 70564.5945558548, "step_time_sec": 8.224876909982413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8506, "loss": 4.094964027404785, "lr": 0.00013374259423732561, "elapsed_sec": 70572.82530713081, "step_time_sec": 8.23063386202557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8507, "loss": 4.0911946296691895, "lr": 0.00013362874058437562, "elapsed_sec": 70581.05565953255, "step_time_sec": 8.230197011027485, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8508, "loss": 4.170497417449951, "lr": 0.00013351484168654782, "elapsed_sec": 70589.28671050072, "step_time_sec": 8.230867328005843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8509, "loss": 4.094784736633301, "lr": 0.00013340089772384818, "elapsed_sec": 70597.51810073853, "step_time_sec": 8.231209488003515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8510, "loss": 4.084589004516602, "lr": 0.00013328690887635377, "elapsed_sec": 70605.74933552742, "step_time_sec": 8.231091167021077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8511, "loss": 4.174625396728516, "lr": 0.0001331728753242126, "elapsed_sec": 70613.97978782654, "step_time_sec": 8.230353170016315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8512, "loss": 4.154334545135498, "lr": 0.00013305879724764344, "elapsed_sec": 70622.21051931381, "step_time_sec": 8.230503786995541, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8513, "loss": 4.1463847160339355, "lr": 0.00013294467482693526, "elapsed_sec": 70630.4417848587, "step_time_sec": 8.231134578003548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8514, "loss": 4.18644380569458, "lr": 0.0001328305082424472, "elapsed_sec": 70638.67237091064, "step_time_sec": 8.230499715020414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8515, "loss": 4.066800594329834, "lr": 0.00013271629767460825, "elapsed_sec": 70646.90403366089, "step_time_sec": 8.231446162040811, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8516, "loss": 4.0225114822387695, "lr": 0.00013260204330391682, "elapsed_sec": 70655.13466143608, "step_time_sec": 8.230442274012603, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8517, "loss": 4.1690354347229, "lr": 0.00013248774531094063, "elapsed_sec": 70663.36515378952, "step_time_sec": 8.23040157003561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8518, "loss": 4.162052631378174, "lr": 0.0001323734038763162, "elapsed_sec": 70671.5964627266, "step_time_sec": 8.231104446051177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8519, "loss": 4.014646053314209, "lr": 0.0001322590191807489, "elapsed_sec": 70679.82658267021, "step_time_sec": 8.229937899974175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8520, "loss": 4.081300735473633, "lr": 0.00013214459140501226, "elapsed_sec": 70688.05770683289, "step_time_sec": 8.230982428998686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8521, "loss": 4.172067165374756, "lr": 0.0001320301207299481, "elapsed_sec": 70696.28528237343, "step_time_sec": 8.227460914000403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8522, "loss": 4.116971015930176, "lr": 0.0001319156073364659, "elapsed_sec": 70704.5144059658, "step_time_sec": 8.228947244992014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8523, "loss": 4.024746417999268, "lr": 0.00013180105140554266, "elapsed_sec": 70712.74442958832, "step_time_sec": 8.229875438963063, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8524, "loss": 4.1518940925598145, "lr": 0.00013168645311822268, "elapsed_sec": 70720.97412228584, "step_time_sec": 8.229595236014575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8525, "loss": 4.0800604820251465, "lr": 0.00013157181265561715, "elapsed_sec": 70729.20500993729, "step_time_sec": 8.23068721400341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8526, "loss": 4.091342926025391, "lr": 0.00013145713019890393, "elapsed_sec": 70737.43368244171, "step_time_sec": 8.228471576003358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8527, "loss": 4.2191853523254395, "lr": 0.00013134240592932724, "elapsed_sec": 70745.66490340233, "step_time_sec": 8.231066189007834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8528, "loss": 4.011129379272461, "lr": 0.00013122764002819733, "elapsed_sec": 70753.89646172523, "step_time_sec": 8.231446134974249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8529, "loss": 4.118689060211182, "lr": 0.00013111283267689037, "elapsed_sec": 70762.12786340714, "step_time_sec": 8.231202574039344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8530, "loss": 3.985767126083374, "lr": 0.0001309979840568479, "elapsed_sec": 70770.35848093033, "step_time_sec": 8.230484104016796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8531, "loss": 4.343873500823975, "lr": 0.00013088309434957675, "elapsed_sec": 70778.58865904808, "step_time_sec": 8.23006480501499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8532, "loss": 4.163682460784912, "lr": 0.00013076816373664874, "elapsed_sec": 70786.81601238251, "step_time_sec": 8.227197572006844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8533, "loss": 4.282049655914307, "lr": 0.00013065319239970023, "elapsed_sec": 70795.04698610306, "step_time_sec": 8.23075243603671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8534, "loss": 4.197038650512695, "lr": 0.00013053818052043194, "elapsed_sec": 70803.2764031887, "step_time_sec": 8.229261917003896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8535, "loss": 3.686532974243164, "lr": 0.0001304231282806088, "elapsed_sec": 70811.50671648979, "step_time_sec": 8.230164117005188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8536, "loss": 4.040095329284668, "lr": 0.0001303080358620594, "elapsed_sec": 70819.73757743835, "step_time_sec": 8.23075140803121, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8537, "loss": 3.989466428756714, "lr": 0.00013019290344667582, "elapsed_sec": 70827.96845531464, "step_time_sec": 8.230686692986637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8538, "loss": 3.958566188812256, "lr": 0.00013007773121641344, "elapsed_sec": 70836.19931864738, "step_time_sec": 8.230733948992565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8539, "loss": 3.8794503211975098, "lr": 0.00012996251935329053, "elapsed_sec": 70844.43035125732, "step_time_sec": 8.230872199987061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8540, "loss": 4.037501811981201, "lr": 0.00012984726803938793, "elapsed_sec": 70852.66031932831, "step_time_sec": 8.2298059980385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8541, "loss": 4.168520927429199, "lr": 0.00012973197745684895, "elapsed_sec": 70860.89142918587, "step_time_sec": 8.230968108982779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8542, "loss": 4.066051483154297, "lr": 0.00012961664778787883, "elapsed_sec": 70869.12132787704, "step_time_sec": 8.229746846016496, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8543, "loss": 4.002058029174805, "lr": 0.00012950127921474463, "elapsed_sec": 70877.35145521164, "step_time_sec": 8.230017989000771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8544, "loss": 4.17834997177124, "lr": 0.000129385871919775, "elapsed_sec": 70885.58225440979, "step_time_sec": 8.230590899998788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8545, "loss": 3.928532361984253, "lr": 0.00012927042608535964, "elapsed_sec": 70893.8126885891, "step_time_sec": 8.230337199987844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8546, "loss": 4.11597204208374, "lr": 0.0001291549418939492, "elapsed_sec": 70902.0441596508, "step_time_sec": 8.231253473029938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8547, "loss": 4.142262935638428, "lr": 0.00012903941952805494, "elapsed_sec": 70910.27412962914, "step_time_sec": 8.229812034987845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8548, "loss": 4.070409297943115, "lr": 0.00012892385917024854, "elapsed_sec": 70918.50590896606, "step_time_sec": 8.231669483997393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8549, "loss": 4.085076332092285, "lr": 0.0001288082610031616, "elapsed_sec": 70926.7367618084, "step_time_sec": 8.230629897967447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8550, "loss": 4.04774284362793, "lr": 0.00012869262520948558, "elapsed_sec": 70934.9644844532, "step_time_sec": 8.227621225989424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8551, "loss": 4.102221488952637, "lr": 0.0001285769519719713, "elapsed_sec": 70943.19384503365, "step_time_sec": 8.229152630025055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8552, "loss": 4.076539039611816, "lr": 0.00012846124147342883, "elapsed_sec": 70951.42395806313, "step_time_sec": 8.230007862031925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8553, "loss": 4.159053325653076, "lr": 0.0001283454938967271, "elapsed_sec": 70959.65506696701, "step_time_sec": 8.230904033000115, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8554, "loss": 3.970383882522583, "lr": 0.00012822970942479367, "elapsed_sec": 70967.88630056381, "step_time_sec": 8.231090868997853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8555, "loss": 4.167671203613281, "lr": 0.00012811388824061435, "elapsed_sec": 70976.11798334122, "step_time_sec": 8.231516224972438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8556, "loss": 4.155329704284668, "lr": 0.00012799803052723302, "elapsed_sec": 70984.34839940071, "step_time_sec": 8.230263948033098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8557, "loss": 4.054134368896484, "lr": 0.0001278821364677513, "elapsed_sec": 70992.5788333416, "step_time_sec": 8.23026313004084, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8558, "loss": 4.082738399505615, "lr": 0.0001277662062453282, "elapsed_sec": 71000.81022953987, "step_time_sec": 8.231239568965975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8559, "loss": 4.0070881843566895, "lr": 0.00012765024004317993, "elapsed_sec": 71009.04063296318, "step_time_sec": 8.230226190004032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8560, "loss": 4.092668533325195, "lr": 0.00012753423804457947, "elapsed_sec": 71017.26936888695, "step_time_sec": 8.22854089998873, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8561, "loss": 4.087948799133301, "lr": 0.00012741820043285656, "elapsed_sec": 71025.4977016449, "step_time_sec": 8.228213388007134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8562, "loss": 4.098570346832275, "lr": 0.00012730212739139703, "elapsed_sec": 71033.72799229622, "step_time_sec": 8.230089248972945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8563, "loss": 3.9777300357818604, "lr": 0.0001271860191036428, "elapsed_sec": 71041.95818686485, "step_time_sec": 8.2300633470295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8564, "loss": 3.9962005615234375, "lr": 0.00012706987575309144, "elapsed_sec": 71050.18678832054, "step_time_sec": 8.22839850798482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8565, "loss": 3.8925397396087646, "lr": 0.00012695369752329603, "elapsed_sec": 71058.41841387749, "step_time_sec": 8.231459359987639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8566, "loss": 4.154677867889404, "lr": 0.00012683748459786466, "elapsed_sec": 71066.64977312088, "step_time_sec": 8.231222464004532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8567, "loss": 4.055737495422363, "lr": 0.00012672123716046032, "elapsed_sec": 71074.88045835495, "step_time_sec": 8.230532176967245, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8568, "loss": 3.99780011177063, "lr": 0.00012660495539480054, "elapsed_sec": 71083.11115980148, "step_time_sec": 8.230509546992835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8569, "loss": 4.025853157043457, "lr": 0.00012648863948465705, "elapsed_sec": 71091.34170126915, "step_time_sec": 8.230403512017801, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8570, "loss": 4.180479049682617, "lr": 0.00012637228961385563, "elapsed_sec": 71099.57229471207, "step_time_sec": 8.230390949000139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8571, "loss": 4.0257887840271, "lr": 0.00012625590596627564, "elapsed_sec": 71107.80219697952, "step_time_sec": 8.229777914995793, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8572, "loss": 3.96492600440979, "lr": 0.00012613948872584988, "elapsed_sec": 71116.03257918358, "step_time_sec": 8.23021006199997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8573, "loss": 4.036098003387451, "lr": 0.0001260230380765642, "elapsed_sec": 71124.26251387596, "step_time_sec": 8.229821854969487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8574, "loss": 4.080698490142822, "lr": 0.00012590655420245732, "elapsed_sec": 71132.49338555336, "step_time_sec": 8.230653543956578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8575, "loss": 3.9275729656219482, "lr": 0.0001257900372876204, "elapsed_sec": 71140.7249724865, "step_time_sec": 8.23147091496503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8576, "loss": 4.009306907653809, "lr": 0.0001256734875161968, "elapsed_sec": 71148.95629453659, "step_time_sec": 8.231158946000505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8577, "loss": 3.940373659133911, "lr": 0.0001255569050723819, "elapsed_sec": 71157.1868929863, "step_time_sec": 8.230412230012007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8578, "loss": 3.918843984603882, "lr": 0.0001254402901404226, "elapsed_sec": 71165.41709947586, "step_time_sec": 8.230050379992463, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8579, "loss": 4.174170970916748, "lr": 0.00012532364290461726, "elapsed_sec": 71173.64887881279, "step_time_sec": 8.231597360048909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8580, "loss": 3.803218126296997, "lr": 0.00012520696354931523, "elapsed_sec": 71181.88085198402, "step_time_sec": 8.231845455011353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8581, "loss": 3.951709270477295, "lr": 0.0001250902522589166, "elapsed_sec": 71190.11024594307, "step_time_sec": 8.229247151990421, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8582, "loss": 3.95235538482666, "lr": 0.00012497350921787195, "elapsed_sec": 71198.3394947052, "step_time_sec": 8.229082803009078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8583, "loss": 4.0625104904174805, "lr": 0.0001248567346106821, "elapsed_sec": 71206.56679677963, "step_time_sec": 8.227202378970105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8584, "loss": 4.2031660079956055, "lr": 0.00012473992862189768, "elapsed_sec": 71214.79800581932, "step_time_sec": 8.231010071991477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8585, "loss": 4.059568405151367, "lr": 0.0001246230914361189, "elapsed_sec": 71223.02849292755, "step_time_sec": 8.230355122999754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8586, "loss": 3.9759747982025146, "lr": 0.0001245062232379954, "elapsed_sec": 71231.2581152916, "step_time_sec": 8.229492718004622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8587, "loss": 4.038230895996094, "lr": 0.00012438932421222567, "elapsed_sec": 71239.48797178268, "step_time_sec": 8.229616630007513, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8588, "loss": 4.109641075134277, "lr": 0.00012427239454355704, "elapsed_sec": 71247.71559238434, "step_time_sec": 8.227469511970412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8589, "loss": 4.13153076171875, "lr": 0.00012415543441678518, "elapsed_sec": 71255.94705057144, "step_time_sec": 8.231311756011564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8590, "loss": 4.115842819213867, "lr": 0.000124038444016754, "elapsed_sec": 71264.17717432976, "step_time_sec": 8.230037898989394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8591, "loss": 4.018014430999756, "lr": 0.0001239214235283551, "elapsed_sec": 71272.40774798393, "step_time_sec": 8.230330106976908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8592, "loss": 4.056178569793701, "lr": 0.00012380437313652787, "elapsed_sec": 71280.63866829872, "step_time_sec": 8.23079336801311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8593, "loss": 4.149214744567871, "lr": 0.00012368729302625868, "elapsed_sec": 71288.86987185478, "step_time_sec": 8.231019405997358, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8594, "loss": 4.060923099517822, "lr": 0.00012357018338258108, "elapsed_sec": 71297.10055398941, "step_time_sec": 8.230524100014009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8595, "loss": 3.8992977142333984, "lr": 0.00012345304439057519, "elapsed_sec": 71305.33127188683, "step_time_sec": 8.230618743982632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8596, "loss": 4.008516311645508, "lr": 0.00012333587623536755, "elapsed_sec": 71313.56226682663, "step_time_sec": 8.23078758403426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8597, "loss": 3.955749273300171, "lr": 0.00012321867910213073, "elapsed_sec": 71321.79252648354, "step_time_sec": 8.230151087976992, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8598, "loss": 4.065338611602783, "lr": 0.00012310145317608322, "elapsed_sec": 71330.02313303947, "step_time_sec": 8.230492378992494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8599, "loss": 4.115305423736572, "lr": 0.00012298419864248888, "elapsed_sec": 71338.25357460976, "step_time_sec": 8.230273884022608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8600, "loss": 4.000481128692627, "lr": 0.00012286691568665688, "elapsed_sec": 71346.48387145996, "step_time_sec": 8.230097694962751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8601, "loss": 4.083104133605957, "lr": 0.00012274960449394127, "elapsed_sec": 71354.7140583992, "step_time_sec": 8.230024939985014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8602, "loss": 4.054177761077881, "lr": 0.0001226322652497407, "elapsed_sec": 71362.94527840614, "step_time_sec": 8.23104282002896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8603, "loss": 3.8969461917877197, "lr": 0.00012251489813949818, "elapsed_sec": 71371.1760559082, "step_time_sec": 8.230636867985595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8604, "loss": 4.063159465789795, "lr": 0.00012239750334870077, "elapsed_sec": 71379.4060306549, "step_time_sec": 8.229825676011387, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8605, "loss": 4.0464348793029785, "lr": 0.00012228008106287928, "elapsed_sec": 71387.63682460785, "step_time_sec": 8.230593701009639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8606, "loss": 4.06300687789917, "lr": 0.00012216263146760793, "elapsed_sec": 71395.86753797531, "step_time_sec": 8.230562857002951, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8607, "loss": 3.91633939743042, "lr": 0.00012204515474850412, "elapsed_sec": 71404.09776997566, "step_time_sec": 8.230130838986952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8608, "loss": 3.9158987998962402, "lr": 0.00012192765109122812, "elapsed_sec": 71412.32642221451, "step_time_sec": 8.228451003029477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8609, "loss": 3.892422914505005, "lr": 0.00012181012068148279, "elapsed_sec": 71420.55581521988, "step_time_sec": 8.229238633008208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8610, "loss": 4.042487144470215, "lr": 0.00012169256370501328, "elapsed_sec": 71428.78663921356, "step_time_sec": 8.230659644003026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8611, "loss": 3.9816646575927734, "lr": 0.00012157498034760668, "elapsed_sec": 71437.01669692993, "step_time_sec": 8.229927624983247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8612, "loss": 4.078569412231445, "lr": 0.00012145737079509181, "elapsed_sec": 71445.24716234207, "step_time_sec": 8.230316866014618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8613, "loss": 4.279323577880859, "lr": 0.00012133973523333884, "elapsed_sec": 71453.47968959808, "step_time_sec": 8.2323475680314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8614, "loss": 3.9338772296905518, "lr": 0.00012122207384825906, "elapsed_sec": 71461.7101867199, "step_time_sec": 8.230393294012174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8615, "loss": 4.058927059173584, "lr": 0.00012110438682580468, "elapsed_sec": 71469.94133782387, "step_time_sec": 8.2309895909857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8616, "loss": 3.955320119857788, "lr": 0.00012098667435196824, "elapsed_sec": 71478.17135238647, "step_time_sec": 8.229835691046901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8617, "loss": 3.9591116905212402, "lr": 0.00012086893661278267, "elapsed_sec": 71486.40206313133, "step_time_sec": 8.230558007024229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8618, "loss": 4.116209506988525, "lr": 0.00012075117379432073, "elapsed_sec": 71494.63304758072, "step_time_sec": 8.230879383045249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8619, "loss": 3.878891706466675, "lr": 0.00012063338608269481, "elapsed_sec": 71502.86350393295, "step_time_sec": 8.23023742402438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8620, "loss": 4.05318546295166, "lr": 0.00012051557366405674, "elapsed_sec": 71511.09392642975, "step_time_sec": 8.230255153030157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8621, "loss": 3.8732144832611084, "lr": 0.0001203977367245973, "elapsed_sec": 71519.32452607155, "step_time_sec": 8.23049590998562, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8622, "loss": 4.116639614105225, "lr": 0.00012027987545054605, "elapsed_sec": 71527.55485749245, "step_time_sec": 8.230125569971278, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8623, "loss": 3.995793581008911, "lr": 0.00012016199002817103, "elapsed_sec": 71535.78535795212, "step_time_sec": 8.23039763502311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8624, "loss": 4.0310282707214355, "lr": 0.00012004408064377844, "elapsed_sec": 71544.0160973072, "step_time_sec": 8.230546479986515, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8625, "loss": 4.044683933258057, "lr": 0.0001199261474837123, "elapsed_sec": 71552.24694514275, "step_time_sec": 8.230695559002925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8626, "loss": 4.075584888458252, "lr": 0.00011980819073435426, "elapsed_sec": 71560.4771001339, "step_time_sec": 8.230014184024185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8627, "loss": 4.093453407287598, "lr": 0.00011969021058212323, "elapsed_sec": 71568.70885205269, "step_time_sec": 8.231588578026276, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8628, "loss": 4.2117509841918945, "lr": 0.00011957220721347514, "elapsed_sec": 71576.9382956028, "step_time_sec": 8.229311902017798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8629, "loss": 4.173976421356201, "lr": 0.00011945418081490252, "elapsed_sec": 71585.16887354851, "step_time_sec": 8.230378173990175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8630, "loss": 4.035418510437012, "lr": 0.00011933613157293442, "elapsed_sec": 71593.39930200577, "step_time_sec": 8.230275654990692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8631, "loss": 4.074500560760498, "lr": 0.00011921805967413588, "elapsed_sec": 71601.63056111336, "step_time_sec": 8.231152290012687, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8632, "loss": 4.036254405975342, "lr": 0.00011909996530510781, "elapsed_sec": 71609.86092495918, "step_time_sec": 8.23014742101077, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8633, "loss": 3.982861280441284, "lr": 0.00011898184865248664, "elapsed_sec": 71618.0917994976, "step_time_sec": 8.230741786013823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8634, "loss": 3.9988465309143066, "lr": 0.00011886370990294397, "elapsed_sec": 71626.32254266739, "step_time_sec": 8.230632974009495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8635, "loss": 3.9992549419403076, "lr": 0.00011874554924318635, "elapsed_sec": 71634.55308365822, "step_time_sec": 8.230381255038083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8636, "loss": 3.93267560005188, "lr": 0.00011862736685995497, "elapsed_sec": 71642.78379106522, "step_time_sec": 8.230511740024667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8637, "loss": 3.9938740730285645, "lr": 0.00011850916294002531, "elapsed_sec": 71651.01362228394, "step_time_sec": 8.229679843992926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8638, "loss": 4.042017936706543, "lr": 0.00011839093767020691, "elapsed_sec": 71659.24366307259, "step_time_sec": 8.229929669992998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8639, "loss": 3.936439275741577, "lr": 0.00011827269123734309, "elapsed_sec": 71667.47362446785, "step_time_sec": 8.229812959034462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8640, "loss": 4.143510818481445, "lr": 0.00011815442382831054, "elapsed_sec": 71675.70462656021, "step_time_sec": 8.23078451200854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8641, "loss": 3.992398500442505, "lr": 0.00011803613563001917, "elapsed_sec": 71683.93423080444, "step_time_sec": 8.229456344037317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8642, "loss": 3.82513427734375, "lr": 0.0001179178268294117, "elapsed_sec": 71692.1649889946, "step_time_sec": 8.23058797896374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8643, "loss": 3.85610294342041, "lr": 0.00011779949761346342, "elapsed_sec": 71700.39316129684, "step_time_sec": 8.228054088016506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8644, "loss": 4.0182085037231445, "lr": 0.00011768114816918187, "elapsed_sec": 71708.62301683426, "step_time_sec": 8.229698575974908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8645, "loss": 4.0419816970825195, "lr": 0.00011756277868360663, "elapsed_sec": 71716.85187125206, "step_time_sec": 8.228726152970921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8646, "loss": 4.012211322784424, "lr": 0.00011744438934380888, "elapsed_sec": 71725.0821530819, "step_time_sec": 8.230082286987454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8647, "loss": 3.9620723724365234, "lr": 0.0001173259803368912, "elapsed_sec": 71733.31219911575, "step_time_sec": 8.229875974007882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8648, "loss": 4.088051795959473, "lr": 0.00011720755184998725, "elapsed_sec": 71741.54111909866, "step_time_sec": 8.228771088004578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8649, "loss": 4.001232147216797, "lr": 0.00011708910407026145, "elapsed_sec": 71749.77222204208, "step_time_sec": 8.231024238979444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8650, "loss": 4.001354217529297, "lr": 0.00011697063718490876, "elapsed_sec": 71758.00252914429, "step_time_sec": 8.230081666028127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8651, "loss": 4.185722351074219, "lr": 0.00011685215138115433, "elapsed_sec": 71766.2322781086, "step_time_sec": 8.229577892052475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8652, "loss": 4.046591758728027, "lr": 0.00011673364684625316, "elapsed_sec": 71774.46085381508, "step_time_sec": 8.228486002946738, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8653, "loss": 4.096876621246338, "lr": 0.00011661512376748992, "elapsed_sec": 71782.68988776207, "step_time_sec": 8.22883576899767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8654, "loss": 4.095592498779297, "lr": 0.0001164965823321785, "elapsed_sec": 71790.91990923882, "step_time_sec": 8.229877247998957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8655, "loss": 4.117239475250244, "lr": 0.0001163780227276619, "elapsed_sec": 71799.15046072006, "step_time_sec": 8.230375891958829, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8656, "loss": 4.036471843719482, "lr": 0.00011625944514131178, "elapsed_sec": 71807.38054537773, "step_time_sec": 8.229940283985343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8657, "loss": 4.036426067352295, "lr": 0.00011614084976052819, "elapsed_sec": 71815.61014199257, "step_time_sec": 8.229454165964853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8658, "loss": 4.060984134674072, "lr": 0.00011602223677273936, "elapsed_sec": 71823.83805036545, "step_time_sec": 8.227733039006125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8659, "loss": 4.126467704772949, "lr": 0.0001159036063654013, "elapsed_sec": 71832.06786870956, "step_time_sec": 8.229681727010757, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8660, "loss": 3.956582546234131, "lr": 0.00011578495872599763, "elapsed_sec": 71840.29948472977, "step_time_sec": 8.231432822009083, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8661, "loss": 4.112150192260742, "lr": 0.00011566629404203909, "elapsed_sec": 71848.52790236473, "step_time_sec": 8.22827376803616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8662, "loss": 4.088905334472656, "lr": 0.00011554761250106346, "elapsed_sec": 71856.75574851036, "step_time_sec": 8.227717737026978, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8663, "loss": 4.0720648765563965, "lr": 0.00011542891429063502, "elapsed_sec": 71864.9849729538, "step_time_sec": 8.229022666986566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8664, "loss": 4.0614213943481445, "lr": 0.00011531019959834461, "elapsed_sec": 71873.21597218513, "step_time_sec": 8.230838762014173, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8665, "loss": 4.028099060058594, "lr": 0.00011519146861180895, "elapsed_sec": 71881.44695281982, "step_time_sec": 8.230830173997674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8666, "loss": 4.065983772277832, "lr": 0.00011507272151867052, "elapsed_sec": 71889.67680048943, "step_time_sec": 8.229743995994795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8667, "loss": 4.090848445892334, "lr": 0.00011495395850659734, "elapsed_sec": 71897.90718793869, "step_time_sec": 8.230206913023721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8668, "loss": 4.095284461975098, "lr": 0.0001148351797632825, "elapsed_sec": 71906.13724040985, "step_time_sec": 8.229901541024446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8669, "loss": 4.027215957641602, "lr": 0.00011471638547644405, "elapsed_sec": 71914.36770772934, "step_time_sec": 8.230317371955607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8670, "loss": 4.065239906311035, "lr": 0.0001145975758338245, "elapsed_sec": 71922.59812927246, "step_time_sec": 8.230256616021506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8671, "loss": 4.083734035491943, "lr": 0.00011447875102319071, "elapsed_sec": 71930.82606744766, "step_time_sec": 8.227800080028828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8672, "loss": 4.04567813873291, "lr": 0.00011435991123233343, "elapsed_sec": 71939.05444264412, "step_time_sec": 8.228187775006518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8673, "loss": 3.942305088043213, "lr": 0.00011424105664906718, "elapsed_sec": 71947.28302907944, "step_time_sec": 8.228461910970509, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8674, "loss": 4.005392551422119, "lr": 0.00011412218746122977, "elapsed_sec": 71955.51424694061, "step_time_sec": 8.231096665025689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8675, "loss": 3.8955957889556885, "lr": 0.00011400330385668217, "elapsed_sec": 71963.74544215202, "step_time_sec": 8.230957294988912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8676, "loss": 3.939009428024292, "lr": 0.00011388440602330808, "elapsed_sec": 71971.97640395164, "step_time_sec": 8.230860244017094, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8677, "loss": 3.9712986946105957, "lr": 0.00011376549414901369, "elapsed_sec": 71980.20689058304, "step_time_sec": 8.230264848971274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8678, "loss": 4.008339881896973, "lr": 0.00011364656842172741, "elapsed_sec": 71988.43825769424, "step_time_sec": 8.231276291015092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8679, "loss": 3.9119606018066406, "lr": 0.00011352762902939953, "elapsed_sec": 71996.66841435432, "step_time_sec": 8.229960045951884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8680, "loss": 4.086650848388672, "lr": 0.00011340867616000191, "elapsed_sec": 72004.8989751339, "step_time_sec": 8.23044682602631, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8681, "loss": 4.008514404296875, "lr": 0.00011328971000152778, "elapsed_sec": 72013.12919592857, "step_time_sec": 8.229998464987148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8682, "loss": 4.078860759735107, "lr": 0.00011317073074199125, "elapsed_sec": 72021.36009931564, "step_time_sec": 8.23071269504726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8683, "loss": 3.98787784576416, "lr": 0.00011305173856942727, "elapsed_sec": 72029.59068107605, "step_time_sec": 8.230412945034914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8684, "loss": 3.6708788871765137, "lr": 0.00011293273367189111, "elapsed_sec": 72037.82097816467, "step_time_sec": 8.230156301055104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8685, "loss": 3.8859751224517822, "lr": 0.00011281371623745818, "elapsed_sec": 72046.05067753792, "step_time_sec": 8.229601295024622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8686, "loss": 3.9957926273345947, "lr": 0.0001126946864542237, "elapsed_sec": 72054.28104019165, "step_time_sec": 8.230199220997747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8687, "loss": 4.072388172149658, "lr": 0.00011257564451030241, "elapsed_sec": 72062.51219964027, "step_time_sec": 8.2309420859674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8688, "loss": 4.103943824768066, "lr": 0.00011245659059382827, "elapsed_sec": 72070.7418782711, "step_time_sec": 8.229563727974892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8689, "loss": 3.9649078845977783, "lr": 0.00011233752489295417, "elapsed_sec": 72078.96983456612, "step_time_sec": 8.227777323976625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8690, "loss": 3.9695985317230225, "lr": 0.00011221844759585157, "elapsed_sec": 72087.19852757454, "step_time_sec": 8.228587593010161, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8691, "loss": 3.918229341506958, "lr": 0.00011209935889071032, "elapsed_sec": 72095.42621541023, "step_time_sec": 8.227447149984073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8692, "loss": 4.070336818695068, "lr": 0.00011198025896573827, "elapsed_sec": 72103.65360760689, "step_time_sec": 8.227211480028927, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8693, "loss": 4.13555383682251, "lr": 0.00011186114800916102, "elapsed_sec": 72111.88337802887, "step_time_sec": 8.22964256600244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8694, "loss": 3.988516330718994, "lr": 0.0001117420262092216, "elapsed_sec": 72120.11232638359, "step_time_sec": 8.22881918301573, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8695, "loss": 4.13838005065918, "lr": 0.00011162289375418016, "elapsed_sec": 72128.33936071396, "step_time_sec": 8.226856007007882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8696, "loss": 4.1062774658203125, "lr": 0.00011150375083231367, "elapsed_sec": 72136.56883215904, "step_time_sec": 8.22934666799847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8697, "loss": 4.00524377822876, "lr": 0.00011138459763191573, "elapsed_sec": 72144.800085783, "step_time_sec": 8.231108490959741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8698, "loss": 4.075328350067139, "lr": 0.00011126543434129606, "elapsed_sec": 72153.0300359726, "step_time_sec": 8.229764015995897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8699, "loss": 4.032296657562256, "lr": 0.00011114626114878045, "elapsed_sec": 72161.26069688797, "step_time_sec": 8.23046166903805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8700, "loss": 3.9678614139556885, "lr": 0.00011102707824271025, "elapsed_sec": 72169.49187636375, "step_time_sec": 8.231031474017072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8701, "loss": 3.953793525695801, "lr": 0.0001109078858114422, "elapsed_sec": 72177.72317886353, "step_time_sec": 8.231189411017112, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8702, "loss": 3.903233766555786, "lr": 0.00011078868404334807, "elapsed_sec": 72185.95375728607, "step_time_sec": 8.230370467994362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8703, "loss": 3.885821580886841, "lr": 0.00011066947312681444, "elapsed_sec": 72194.18408322334, "step_time_sec": 8.230274741014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8704, "loss": 3.967435836791992, "lr": 0.00011055025325024225, "elapsed_sec": 72202.41588664055, "step_time_sec": 8.231595173012465, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8705, "loss": 4.0208024978637695, "lr": 0.00011043102460204672, "elapsed_sec": 72210.64579081535, "step_time_sec": 8.229738335998263, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8706, "loss": 4.005836009979248, "lr": 0.00011031178737065684, "elapsed_sec": 72218.87700653076, "step_time_sec": 8.23105033498723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8707, "loss": 3.8143117427825928, "lr": 0.0001101925417445152, "elapsed_sec": 72227.10719919205, "step_time_sec": 8.23008441599086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8708, "loss": 4.0106682777404785, "lr": 0.00011007328791207766, "elapsed_sec": 72235.33892035484, "step_time_sec": 8.23153876804281, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8709, "loss": 3.9641807079315186, "lr": 0.00010995402606181304, "elapsed_sec": 72243.56995773315, "step_time_sec": 8.230850166000891, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8710, "loss": 3.9617385864257812, "lr": 0.00010983475638220284, "elapsed_sec": 72251.80032777786, "step_time_sec": 8.230216402967926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8711, "loss": 4.0698113441467285, "lr": 0.00010971547906174094, "elapsed_sec": 72260.03715920448, "step_time_sec": 8.230434662022162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8712, "loss": 3.9912960529327393, "lr": 0.00010959619428893326, "elapsed_sec": 72268.26772046089, "step_time_sec": 8.230354141036514, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8713, "loss": 4.065634250640869, "lr": 0.00010947690225229755, "elapsed_sec": 72276.49882531166, "step_time_sec": 8.230916200962383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8714, "loss": 4.114887237548828, "lr": 0.00010935760314036299, "elapsed_sec": 72284.72914838791, "step_time_sec": 8.230169903021306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8715, "loss": 3.9179670810699463, "lr": 0.00010923829714166994, "elapsed_sec": 72292.95969414711, "step_time_sec": 8.230412360979244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8716, "loss": 3.997466802597046, "lr": 0.00010911898444476973, "elapsed_sec": 72301.19018006325, "step_time_sec": 8.230331447965, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8717, "loss": 4.10292387008667, "lr": 0.00010899966523822418, "elapsed_sec": 72309.42049455643, "step_time_sec": 8.230127635993995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8718, "loss": 4.101484775543213, "lr": 0.00010888033971060543, "elapsed_sec": 72317.65064668655, "step_time_sec": 8.230019946000539, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8719, "loss": 3.9859235286712646, "lr": 0.00010876100805049561, "elapsed_sec": 72325.87954187393, "step_time_sec": 8.22869408497354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8720, "loss": 4.014129638671875, "lr": 0.00010864167044648656, "elapsed_sec": 72334.10988211632, "step_time_sec": 8.230200112971943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8721, "loss": 4.037094593048096, "lr": 0.00010852232708717945, "elapsed_sec": 72342.34010624886, "step_time_sec": 8.230080100998748, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8722, "loss": 3.968916177749634, "lr": 0.00010840297816118464, "elapsed_sec": 72350.5717113018, "step_time_sec": 8.231385461054742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8723, "loss": 4.020269870758057, "lr": 0.00010828362385712124, "elapsed_sec": 72358.80130624771, "step_time_sec": 8.229413155990187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8724, "loss": 3.9227778911590576, "lr": 0.00010816426436361684, "elapsed_sec": 72367.03114271164, "step_time_sec": 8.229721640003845, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8725, "loss": 3.885697364807129, "lr": 0.00010804489986930727, "elapsed_sec": 72375.25826215744, "step_time_sec": 8.226981008017901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8726, "loss": 4.176624774932861, "lr": 0.0001079255305628362, "elapsed_sec": 72383.48834824562, "step_time_sec": 8.229833610006608, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8727, "loss": 4.005798816680908, "lr": 0.00010780615663285498, "elapsed_sec": 72391.71932435036, "step_time_sec": 8.230828289990313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8728, "loss": 4.207278251647949, "lr": 0.00010768677826802225, "elapsed_sec": 72399.9489710331, "step_time_sec": 8.229485956020653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8729, "loss": 4.163966655731201, "lr": 0.00010756739565700363, "elapsed_sec": 72408.17836809158, "step_time_sec": 8.229318280005828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8730, "loss": 4.154783725738525, "lr": 0.00010744800898847147, "elapsed_sec": 72416.40693449974, "step_time_sec": 8.228352606995031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8731, "loss": 4.13192081451416, "lr": 0.00010732861845110452, "elapsed_sec": 72424.63739299774, "step_time_sec": 8.230311697989237, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8732, "loss": 4.129744052886963, "lr": 0.00010720922423358765, "elapsed_sec": 72432.86809110641, "step_time_sec": 8.230534686997999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8733, "loss": 4.1949262619018555, "lr": 0.00010708982652461156, "elapsed_sec": 72441.0988008976, "step_time_sec": 8.230567461985629, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8734, "loss": 4.164474010467529, "lr": 0.00010697042551287244, "elapsed_sec": 72449.32852196693, "step_time_sec": 8.229499663051683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8735, "loss": 3.973381996154785, "lr": 0.00010685102138707174, "elapsed_sec": 72457.55889344215, "step_time_sec": 8.230231975961942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8736, "loss": 3.9525370597839355, "lr": 0.00010673161433591582, "elapsed_sec": 72465.78912138939, "step_time_sec": 8.23002739995718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8737, "loss": 4.093543529510498, "lr": 0.0001066122045481156, "elapsed_sec": 72474.02055096626, "step_time_sec": 8.231271572003607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8738, "loss": 4.0401482582092285, "lr": 0.0001064927922123864, "elapsed_sec": 72482.24994015694, "step_time_sec": 8.229287887050305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8739, "loss": 4.1036295890808105, "lr": 0.00010637337751744755, "elapsed_sec": 72490.47800087929, "step_time_sec": 8.227911187976133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8740, "loss": 3.880133628845215, "lr": 0.00010625396065202211, "elapsed_sec": 72498.70766615868, "step_time_sec": 8.229420287010726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8741, "loss": 4.169203281402588, "lr": 0.00010613454180483653, "elapsed_sec": 72506.93786716461, "step_time_sec": 8.230050910962746, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8742, "loss": 4.0156121253967285, "lr": 0.00010601512116462047, "elapsed_sec": 72515.16887331009, "step_time_sec": 8.230906725977547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8743, "loss": 4.032777786254883, "lr": 0.00010589569892010632, "elapsed_sec": 72523.3969631195, "step_time_sec": 8.227872828021646, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8744, "loss": 4.063382625579834, "lr": 0.00010577627526002908, "elapsed_sec": 72531.627764225, "step_time_sec": 8.230692551005632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8745, "loss": 3.8516552448272705, "lr": 0.00010565685037312602, "elapsed_sec": 72539.85668301582, "step_time_sec": 8.228674233017955, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8746, "loss": 4.076182842254639, "lr": 0.00010553742444813624, "elapsed_sec": 72548.08720064163, "step_time_sec": 8.230382768029813, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8747, "loss": 4.092944145202637, "lr": 0.00010541799767380059, "elapsed_sec": 72556.31734633446, "step_time_sec": 8.229981748969294, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8748, "loss": 4.064695358276367, "lr": 0.00010529857023886114, "elapsed_sec": 72564.54495978355, "step_time_sec": 8.227459531044587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8749, "loss": 4.052779197692871, "lr": 0.00010517914233206111, "elapsed_sec": 72572.77198648453, "step_time_sec": 8.226850737002678, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8750, "loss": 3.9858343601226807, "lr": 0.00010505971414214447, "elapsed_sec": 72580.99968457222, "step_time_sec": 8.22756168601336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8751, "loss": 3.9544572830200195, "lr": 0.00010494028585785556, "elapsed_sec": 72589.2302672863, "step_time_sec": 8.230414277000818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8752, "loss": 3.981203317642212, "lr": 0.00010482085766793891, "elapsed_sec": 72597.4609568119, "step_time_sec": 8.230568094993941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8753, "loss": 3.929379463195801, "lr": 0.00010470142976113889, "elapsed_sec": 72605.69188976288, "step_time_sec": 8.230693497986067, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8754, "loss": 4.056965351104736, "lr": 0.00010458200232619945, "elapsed_sec": 72613.9224793911, "step_time_sec": 8.230416635982692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8755, "loss": 4.062180995941162, "lr": 0.00010446257555186378, "elapsed_sec": 72622.15426778793, "step_time_sec": 8.231639298028313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8756, "loss": 4.053943157196045, "lr": 0.000104343149626874, "elapsed_sec": 72630.38242340088, "step_time_sec": 8.22805769596016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8757, "loss": 4.163810729980469, "lr": 0.00010422372473997093, "elapsed_sec": 72638.6114552021, "step_time_sec": 8.228857941983733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8758, "loss": 4.185450553894043, "lr": 0.00010410430107989371, "elapsed_sec": 72646.84120249748, "step_time_sec": 8.229582656000275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8759, "loss": 4.063606262207031, "lr": 0.00010398487883537955, "elapsed_sec": 72655.0730817318, "step_time_sec": 8.2316964410129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8760, "loss": 3.9339189529418945, "lr": 0.00010386545819516347, "elapsed_sec": 72663.30328679085, "step_time_sec": 8.230008787009865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8761, "loss": 3.982707977294922, "lr": 0.00010374603934797791, "elapsed_sec": 72671.53340768814, "step_time_sec": 8.22994354297407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8762, "loss": 4.000027179718018, "lr": 0.00010362662248255244, "elapsed_sec": 72679.76452255249, "step_time_sec": 8.230953260033857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8763, "loss": 4.121155738830566, "lr": 0.00010350720778761362, "elapsed_sec": 72687.99423003197, "step_time_sec": 8.229581900988705, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8764, "loss": 4.128565311431885, "lr": 0.00010338779545188442, "elapsed_sec": 72696.22588467598, "step_time_sec": 8.231469094986096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8765, "loss": 4.062410831451416, "lr": 0.00010326838566408423, "elapsed_sec": 72704.45569777489, "step_time_sec": 8.229702194978017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8766, "loss": 4.135798931121826, "lr": 0.00010314897861292826, "elapsed_sec": 72712.68714761734, "step_time_sec": 8.231323052023072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8767, "loss": 4.204688549041748, "lr": 0.00010302957448712755, "elapsed_sec": 72720.91760826111, "step_time_sec": 8.230275968962815, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8768, "loss": 3.9988088607788086, "lr": 0.00010291017347538848, "elapsed_sec": 72729.14880347252, "step_time_sec": 8.231014685006812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8769, "loss": 3.9335458278656006, "lr": 0.00010279077576641237, "elapsed_sec": 72737.37940263748, "step_time_sec": 8.230465694970917, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8770, "loss": 4.020765781402588, "lr": 0.00010267138154889551, "elapsed_sec": 72745.61006331444, "step_time_sec": 8.230527415988036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8771, "loss": 4.087182998657227, "lr": 0.00010255199101152855, "elapsed_sec": 72753.83974957466, "step_time_sec": 8.229472253005952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8772, "loss": 4.042000770568848, "lr": 0.00010243260434299637, "elapsed_sec": 72762.07053852081, "step_time_sec": 8.230625473021064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8773, "loss": 4.124916076660156, "lr": 0.00010231322173197777, "elapsed_sec": 72770.30291438103, "step_time_sec": 8.232229874003679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8774, "loss": 3.941974639892578, "lr": 0.00010219384336714502, "elapsed_sec": 72778.53161287308, "step_time_sec": 8.228605365031399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8775, "loss": 4.0196533203125, "lr": 0.00010207446943716384, "elapsed_sec": 72786.76091384888, "step_time_sec": 8.229075917974114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8776, "loss": 4.110443115234375, "lr": 0.00010195510013069275, "elapsed_sec": 72794.9889702797, "step_time_sec": 8.22793156397529, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8777, "loss": 4.011272430419922, "lr": 0.0001018357356363832, "elapsed_sec": 72803.21932911873, "step_time_sec": 8.230170244991314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8778, "loss": 4.457544326782227, "lr": 0.00010171637614287878, "elapsed_sec": 72811.45022726059, "step_time_sec": 8.230761684011668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8779, "loss": 4.187194347381592, "lr": 0.00010159702183881534, "elapsed_sec": 72819.68040704727, "step_time_sec": 8.23001901601674, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8780, "loss": 4.062544345855713, "lr": 0.00010147767291282058, "elapsed_sec": 72827.91105866432, "step_time_sec": 8.230478203040548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8781, "loss": 3.9394936561584473, "lr": 0.00010135832955351345, "elapsed_sec": 72836.14285492897, "step_time_sec": 8.231582659995183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8782, "loss": 3.779221534729004, "lr": 0.00010123899194950442, "elapsed_sec": 72844.3735089302, "step_time_sec": 8.230505800980609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8783, "loss": 3.9106457233428955, "lr": 0.00010111966028939459, "elapsed_sec": 72852.60463309288, "step_time_sec": 8.23099110001931, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8784, "loss": 4.093245029449463, "lr": 0.00010100033476177582, "elapsed_sec": 72860.83424806595, "step_time_sec": 8.229468587960582, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8785, "loss": 3.968968391418457, "lr": 0.00010088101555523028, "elapsed_sec": 72869.06572508812, "step_time_sec": 8.23125493596308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8786, "loss": 3.988868236541748, "lr": 0.00010076170285833006, "elapsed_sec": 72877.29634952545, "step_time_sec": 8.230475929973181, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8787, "loss": 4.315004348754883, "lr": 0.00010064239685963706, "elapsed_sec": 72885.52601861954, "step_time_sec": 8.229551496973727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8788, "loss": 4.19673490524292, "lr": 0.00010052309774770247, "elapsed_sec": 72893.75230503082, "step_time_sec": 8.22606612200616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8789, "loss": 4.038812160491943, "lr": 0.00010040380571106677, "elapsed_sec": 72901.98190569878, "step_time_sec": 8.229425785015337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8790, "loss": 3.9533193111419678, "lr": 0.00010028452093825909, "elapsed_sec": 72910.21316742897, "step_time_sec": 8.231151129002683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8791, "loss": 4.127547264099121, "lr": 0.00010016524361779716, "elapsed_sec": 72918.44376564026, "step_time_sec": 8.230378651991487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8792, "loss": 4.013630390167236, "lr": 0.00010004597393818698, "elapsed_sec": 72926.6734521389, "step_time_sec": 8.22953181102639, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8793, "loss": 4.0304179191589355, "lr": 9.992671208792235e-05, "elapsed_sec": 72934.90371131897, "step_time_sec": 8.230093938997015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8794, "loss": 4.024982452392578, "lr": 9.980745825548484e-05, "elapsed_sec": 72943.13395619392, "step_time_sec": 8.230088540003635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8795, "loss": 4.061284065246582, "lr": 9.968821262934318e-05, "elapsed_sec": 72951.36584949493, "step_time_sec": 8.231690608954523, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8796, "loss": 3.8975977897644043, "lr": 9.956897539795329e-05, "elapsed_sec": 72959.59639167786, "step_time_sec": 8.230371553974692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8797, "loss": 4.0519232749938965, "lr": 9.944974674975776e-05, "elapsed_sec": 72977.03347706795, "step_time_sec": 17.436917682003696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8798, "loss": 4.1320414543151855, "lr": 9.933052687318558e-05, "elapsed_sec": 72985.26263856888, "step_time_sec": 8.229006193985697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8799, "loss": 4.024807929992676, "lr": 9.921131595665195e-05, "elapsed_sec": 72993.49326848984, "step_time_sec": 8.230501318990719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8800, "loss": 4.008519172668457, "lr": 9.909211418855781e-05, "elapsed_sec": 73001.72355413437, "step_time_sec": 8.23009541502688, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8801, "loss": 3.84291934967041, "lr": 9.897292175728975e-05, "elapsed_sec": 73009.9557955265, "step_time_sec": 8.232051989005413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8802, "loss": 4.002016067504883, "lr": 9.885373885121957e-05, "elapsed_sec": 73018.18667626381, "step_time_sec": 8.230778000957798, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8803, "loss": 4.014429569244385, "lr": 9.873456565870394e-05, "elapsed_sec": 73026.4169614315, "step_time_sec": 8.230035561020486, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8804, "loss": 3.893808126449585, "lr": 9.86154023680843e-05, "elapsed_sec": 73034.647772789, "step_time_sec": 8.230709551018663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8805, "loss": 3.9605252742767334, "lr": 9.849624916768633e-05, "elapsed_sec": 73042.87755584717, "step_time_sec": 8.229640752018895, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8806, "loss": 3.914135456085205, "lr": 9.837710624581989e-05, "elapsed_sec": 73051.10792207718, "step_time_sec": 8.230139451974537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8807, "loss": 3.9732487201690674, "lr": 9.825797379077842e-05, "elapsed_sec": 73059.337651968, "step_time_sec": 8.229541135020554, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8808, "loss": 4.070653915405273, "lr": 9.813885199083898e-05, "elapsed_sec": 73067.56835913658, "step_time_sec": 8.230541637982242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8809, "loss": 4.064519882202148, "lr": 9.801974103426176e-05, "elapsed_sec": 73075.79707217216, "step_time_sec": 8.22856712603243, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8810, "loss": 4.17510461807251, "lr": 9.79006411092897e-05, "elapsed_sec": 73084.02719116211, "step_time_sec": 8.229901457962114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8811, "loss": 3.9526116847991943, "lr": 9.778155240414847e-05, "elapsed_sec": 73092.25578808784, "step_time_sec": 8.2284530440229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8812, "loss": 4.020784854888916, "lr": 9.766247510704586e-05, "elapsed_sec": 73100.4861176014, "step_time_sec": 8.230191140028182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8813, "loss": 4.024294376373291, "lr": 9.754340940617172e-05, "elapsed_sec": 73108.71693849564, "step_time_sec": 8.230612951971125, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8814, "loss": 3.9456043243408203, "lr": 9.742435548969761e-05, "elapsed_sec": 73116.94634366035, "step_time_sec": 8.229257619008422, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8815, "loss": 3.9728496074676514, "lr": 9.730531354577632e-05, "elapsed_sec": 73125.17451548576, "step_time_sec": 8.227988107013516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8816, "loss": 4.2210164070129395, "lr": 9.718628376254185e-05, "elapsed_sec": 73133.40397334099, "step_time_sec": 8.22932113497518, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8817, "loss": 4.243497371673584, "lr": 9.706726632810891e-05, "elapsed_sec": 73141.63462471962, "step_time_sec": 8.230469776026439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8818, "loss": 4.109224319458008, "lr": 9.694826143057277e-05, "elapsed_sec": 73149.86536359787, "step_time_sec": 8.230583363969345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8819, "loss": 3.8491199016571045, "lr": 9.682926925800877e-05, "elapsed_sec": 73158.09540843964, "step_time_sec": 8.229922239028383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8820, "loss": 3.936203718185425, "lr": 9.671028999847223e-05, "elapsed_sec": 73166.3270099163, "step_time_sec": 8.231377639051061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8821, "loss": 4.059665203094482, "lr": 9.659132383999811e-05, "elapsed_sec": 73174.55810284615, "step_time_sec": 8.230936564039439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8822, "loss": 4.07102632522583, "lr": 9.647237097060048e-05, "elapsed_sec": 73182.7888314724, "step_time_sec": 8.230611588049214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8823, "loss": 4.098618030548096, "lr": 9.635343157827262e-05, "elapsed_sec": 73191.01991868019, "step_time_sec": 8.230917698994745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8824, "loss": 3.9070842266082764, "lr": 9.623450585098632e-05, "elapsed_sec": 73199.24931621552, "step_time_sec": 8.229196380008943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8825, "loss": 3.999316453933716, "lr": 9.611559397669193e-05, "elapsed_sec": 73207.48097586632, "step_time_sec": 8.231477196968626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8826, "loss": 4.015039443969727, "lr": 9.599669614331786e-05, "elapsed_sec": 73215.712069273, "step_time_sec": 8.230980460997671, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8827, "loss": 4.0736165046691895, "lr": 9.587781253877024e-05, "elapsed_sec": 73223.9429409504, "step_time_sec": 8.230637669970747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8828, "loss": 3.9190726280212402, "lr": 9.575894335093287e-05, "elapsed_sec": 73232.17358350754, "step_time_sec": 8.23046641703695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8829, "loss": 3.9861342906951904, "lr": 9.56400887676666e-05, "elapsed_sec": 73240.40445685387, "step_time_sec": 8.230721522995736, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8830, "loss": 4.0442094802856445, "lr": 9.552124897680933e-05, "elapsed_sec": 73248.63468027115, "step_time_sec": 8.230039412039332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8831, "loss": 3.9789679050445557, "lr": 9.540242416617552e-05, "elapsed_sec": 73256.86529755592, "step_time_sec": 8.230483072984498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8832, "loss": 4.022754192352295, "lr": 9.528361452355596e-05, "elapsed_sec": 73265.09616208076, "step_time_sec": 8.230703090957832, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8833, "loss": 4.014962673187256, "lr": 9.516482023671751e-05, "elapsed_sec": 73273.32684469223, "step_time_sec": 8.230511308996938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8834, "loss": 4.05161714553833, "lr": 9.504604149340268e-05, "elapsed_sec": 73281.55909347534, "step_time_sec": 8.232067285978701, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8835, "loss": 4.191577434539795, "lr": 9.492727848132949e-05, "elapsed_sec": 73289.78444576263, "step_time_sec": 8.225220413005445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8836, "loss": 3.987272262573242, "lr": 9.480853138819108e-05, "elapsed_sec": 73298.01279783249, "step_time_sec": 8.228204614017159, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8837, "loss": 4.029180526733398, "lr": 9.468980040165538e-05, "elapsed_sec": 73306.24308896065, "step_time_sec": 8.230098305968568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8838, "loss": 4.0614824295043945, "lr": 9.457108570936499e-05, "elapsed_sec": 73314.47377228737, "step_time_sec": 8.230494170973543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8839, "loss": 4.0398430824279785, "lr": 9.445238749893658e-05, "elapsed_sec": 73322.70519638062, "step_time_sec": 8.231255069025792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8840, "loss": 4.036078453063965, "lr": 9.433370595796093e-05, "elapsed_sec": 73330.93623638153, "step_time_sec": 8.230948118958622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8841, "loss": 3.9870901107788086, "lr": 9.42150412740024e-05, "elapsed_sec": 73339.16769480705, "step_time_sec": 8.231217634980567, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8842, "loss": 3.9588072299957275, "lr": 9.409639363459871e-05, "elapsed_sec": 73347.39829158783, "step_time_sec": 8.230511905043386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8843, "loss": 4.180789947509766, "lr": 9.397776322726066e-05, "elapsed_sec": 73355.62885212898, "step_time_sec": 8.230345254996791, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8844, "loss": 4.071582317352295, "lr": 9.385915023947183e-05, "elapsed_sec": 73363.86045265198, "step_time_sec": 8.231424556986894, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8845, "loss": 4.206017017364502, "lr": 9.374055485868826e-05, "elapsed_sec": 73372.09047937393, "step_time_sec": 8.229883300024085, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8846, "loss": 3.9102885723114014, "lr": 9.362197727233812e-05, "elapsed_sec": 73380.32157111168, "step_time_sec": 8.23091312701581, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8847, "loss": 4.017024040222168, "lr": 9.350341766782153e-05, "elapsed_sec": 73388.55191612244, "step_time_sec": 8.230156350997277, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8848, "loss": 3.911468505859375, "lr": 9.33848762325101e-05, "elapsed_sec": 73396.78287005424, "step_time_sec": 8.230848209001124, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8849, "loss": 4.143026351928711, "lr": 9.326635315374686e-05, "elapsed_sec": 73405.01386857033, "step_time_sec": 8.23081216297578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8850, "loss": 4.030381202697754, "lr": 9.31478486188457e-05, "elapsed_sec": 73413.24440526962, "step_time_sec": 8.23041540000122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8851, "loss": 4.060690402984619, "lr": 9.302936281509126e-05, "elapsed_sec": 73421.47510027885, "step_time_sec": 8.230472904979251, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8852, "loss": 4.029162406921387, "lr": 9.291089592973859e-05, "elapsed_sec": 73429.70535898209, "step_time_sec": 8.230109588999767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8853, "loss": 4.005303859710693, "lr": 9.279244815001279e-05, "elapsed_sec": 73437.93419575691, "step_time_sec": 8.228677897946909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8854, "loss": 4.075190544128418, "lr": 9.267401966310884e-05, "elapsed_sec": 73446.1654407978, "step_time_sec": 8.231094588001724, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8855, "loss": 4.031805038452148, "lr": 9.255561065619114e-05, "elapsed_sec": 73454.39687132835, "step_time_sec": 8.231296637968626, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8856, "loss": 4.0172553062438965, "lr": 9.243722131639338e-05, "elapsed_sec": 73462.6282055378, "step_time_sec": 8.23108477704227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8857, "loss": 4.057837009429932, "lr": 9.231885183081814e-05, "elapsed_sec": 73470.85874009132, "step_time_sec": 8.230374917038716, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8858, "loss": 4.129846572875977, "lr": 9.220050238653659e-05, "elapsed_sec": 73479.08752059937, "step_time_sec": 8.228607080003712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8859, "loss": 3.892758846282959, "lr": 9.208217317058835e-05, "elapsed_sec": 73487.3184633255, "step_time_sec": 8.230793342983816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8860, "loss": 4.052910327911377, "lr": 9.196386436998085e-05, "elapsed_sec": 73495.55024433136, "step_time_sec": 8.231625121028628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8861, "loss": 4.056928634643555, "lr": 9.184557617168946e-05, "elapsed_sec": 73503.77641439438, "step_time_sec": 8.225994112028275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8862, "loss": 3.9544782638549805, "lr": 9.172730876265693e-05, "elapsed_sec": 73512.00523304939, "step_time_sec": 8.228696555946954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8863, "loss": 4.054679870605469, "lr": 9.16090623297931e-05, "elapsed_sec": 73520.23482489586, "step_time_sec": 8.229417358990759, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8864, "loss": 4.035177230834961, "lr": 9.149083705997474e-05, "elapsed_sec": 73528.46494793892, "step_time_sec": 8.22998068999732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8865, "loss": 4.066791534423828, "lr": 9.137263314004506e-05, "elapsed_sec": 73536.69554805756, "step_time_sec": 8.230419311963487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8866, "loss": 3.968559741973877, "lr": 9.125445075681369e-05, "elapsed_sec": 73544.9250061512, "step_time_sec": 8.22929882403696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8867, "loss": 4.062860012054443, "lr": 9.113629009705605e-05, "elapsed_sec": 73553.15273666382, "step_time_sec": 8.227607776992954, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8868, "loss": 4.125980377197266, "lr": 9.101815134751337e-05, "elapsed_sec": 73561.38185858727, "step_time_sec": 8.228976699989289, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8869, "loss": 4.064316749572754, "lr": 9.090003469489222e-05, "elapsed_sec": 73569.61194729805, "step_time_sec": 8.229884425003547, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8870, "loss": 3.9767792224884033, "lr": 9.078194032586413e-05, "elapsed_sec": 73577.84285855293, "step_time_sec": 8.230735672987066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8871, "loss": 4.041011810302734, "lr": 9.066386842706563e-05, "elapsed_sec": 73586.0735104084, "step_time_sec": 8.230541196011472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8872, "loss": 4.004321575164795, "lr": 9.05458191850975e-05, "elapsed_sec": 73594.30412077904, "step_time_sec": 8.23044278100133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8873, "loss": 4.061252593994141, "lr": 9.042779278652487e-05, "elapsed_sec": 73602.5354719162, "step_time_sec": 8.231149449013174, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8874, "loss": 3.9387333393096924, "lr": 9.030978941787679e-05, "elapsed_sec": 73610.76630091667, "step_time_sec": 8.230715433019213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8875, "loss": 3.9490416049957275, "lr": 9.019180926564575e-05, "elapsed_sec": 73618.99720215797, "step_time_sec": 8.230714993027505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8876, "loss": 3.9699244499206543, "lr": 9.007385251628774e-05, "elapsed_sec": 73627.22693324089, "step_time_sec": 8.229582875035703, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8877, "loss": 3.911797046661377, "lr": 8.995591935622159e-05, "elapsed_sec": 73635.45821094513, "step_time_sec": 8.231182763993274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8878, "loss": 3.9717094898223877, "lr": 8.9838009971829e-05, "elapsed_sec": 73643.68906068802, "step_time_sec": 8.230591719970107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8879, "loss": 4.20480489730835, "lr": 8.972012454945396e-05, "elapsed_sec": 73651.91933846474, "step_time_sec": 8.230138206970878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8880, "loss": 4.017090320587158, "lr": 8.96022632754027e-05, "elapsed_sec": 73660.1494987011, "step_time_sec": 8.229998297989368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8881, "loss": 4.0207295417785645, "lr": 8.948442633594329e-05, "elapsed_sec": 73668.3809132576, "step_time_sec": 8.231308771995828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8882, "loss": 4.052187442779541, "lr": 8.93666139173052e-05, "elapsed_sec": 73676.6120839119, "step_time_sec": 8.230936899024528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8883, "loss": 4.2077531814575195, "lr": 8.924882620567932e-05, "elapsed_sec": 73684.84250211716, "step_time_sec": 8.230229633045383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8884, "loss": 4.143472671508789, "lr": 8.913106338721735e-05, "elapsed_sec": 73693.07437419891, "step_time_sec": 8.231733559980057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8885, "loss": 3.874617576599121, "lr": 8.901332564803176e-05, "elapsed_sec": 73701.30541443825, "step_time_sec": 8.230885459983256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8886, "loss": 4.033829689025879, "lr": 8.889561317419536e-05, "elapsed_sec": 73709.53566884995, "step_time_sec": 8.230068977049086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8887, "loss": 4.191888332366943, "lr": 8.877792615174093e-05, "elapsed_sec": 73717.76651859283, "step_time_sec": 8.230665759998374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8888, "loss": 4.048867225646973, "lr": 8.866026476666121e-05, "elapsed_sec": 73725.99717926979, "step_time_sec": 8.230495148978662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8889, "loss": 4.0454301834106445, "lr": 8.854262920490822e-05, "elapsed_sec": 73734.22903084755, "step_time_sec": 8.231775251973886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8890, "loss": 4.098632335662842, "lr": 8.842501965239331e-05, "elapsed_sec": 73742.4598865509, "step_time_sec": 8.230616529996041, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8891, "loss": 3.9069955348968506, "lr": 8.830743629498673e-05, "elapsed_sec": 73750.69001102448, "step_time_sec": 8.229990168008953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8892, "loss": 4.147960186004639, "lr": 8.81898793185172e-05, "elapsed_sec": 73758.92046093941, "step_time_sec": 8.230228380009066, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8893, "loss": 3.9446861743927, "lr": 8.807234890877192e-05, "elapsed_sec": 73767.15143585205, "step_time_sec": 8.230843222991098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8894, "loss": 4.134331226348877, "lr": 8.795484525149592e-05, "elapsed_sec": 73775.38233017921, "step_time_sec": 8.230739984952379, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8895, "loss": 3.959038734436035, "lr": 8.783736853239212e-05, "elapsed_sec": 73783.61283302307, "step_time_sec": 8.230337790970225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8896, "loss": 3.811173439025879, "lr": 8.771991893712075e-05, "elapsed_sec": 73791.84381365776, "step_time_sec": 8.230820244003553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8897, "loss": 4.046817302703857, "lr": 8.760249665129922e-05, "elapsed_sec": 73800.0716919899, "step_time_sec": 8.227665338024963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8898, "loss": 3.993635654449463, "lr": 8.748510186050185e-05, "elapsed_sec": 73808.30056023598, "step_time_sec": 8.228740851045586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8899, "loss": 4.009241580963135, "lr": 8.736773475025931e-05, "elapsed_sec": 73816.530179739, "step_time_sec": 8.2294361940003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8900, "loss": 3.927475690841675, "lr": 8.725039550605876e-05, "elapsed_sec": 73824.7598001957, "step_time_sec": 8.229507283016574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8901, "loss": 3.9064667224884033, "lr": 8.713308431334313e-05, "elapsed_sec": 73832.99131536484, "step_time_sec": 8.231292073964141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8902, "loss": 3.9265379905700684, "lr": 8.70158013575111e-05, "elapsed_sec": 73841.21875286102, "step_time_sec": 8.227247851027641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8903, "loss": 3.995030403137207, "lr": 8.689854682391682e-05, "elapsed_sec": 73849.449832201, "step_time_sec": 8.230934400984552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8904, "loss": 4.108218669891357, "lr": 8.678132089786928e-05, "elapsed_sec": 73857.68156480789, "step_time_sec": 8.231546292023268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8905, "loss": 4.039391040802002, "lr": 8.666412376463252e-05, "elapsed_sec": 73865.91274023056, "step_time_sec": 8.23102927999571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8906, "loss": 4.141515254974365, "lr": 8.654695560942484e-05, "elapsed_sec": 73874.14442706108, "step_time_sec": 8.231604942993727, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8907, "loss": 4.083366394042969, "lr": 8.642981661741896e-05, "elapsed_sec": 73882.37536382675, "step_time_sec": 8.230729045986664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8908, "loss": 3.972076654434204, "lr": 8.631270697374135e-05, "elapsed_sec": 73890.6039428711, "step_time_sec": 8.228397677012254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8909, "loss": 3.9423232078552246, "lr": 8.619562686347215e-05, "elapsed_sec": 73898.83213543892, "step_time_sec": 8.227990559011232, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8910, "loss": 4.067208290100098, "lr": 8.607857647164491e-05, "elapsed_sec": 73907.06127643585, "step_time_sec": 8.2289842540049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8911, "loss": 3.9637532234191895, "lr": 8.596155598324604e-05, "elapsed_sec": 73915.2899916172, "step_time_sec": 8.228606835997198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8912, "loss": 3.924915075302124, "lr": 8.584456558321487e-05, "elapsed_sec": 73923.51607298851, "step_time_sec": 8.225917531002779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8913, "loss": 4.185580253601074, "lr": 8.572760545644301e-05, "elapsed_sec": 73931.74714636803, "step_time_sec": 8.230868401005864, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8914, "loss": 3.9479453563690186, "lr": 8.561067578777435e-05, "elapsed_sec": 73939.97883796692, "step_time_sec": 8.231485079973936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8915, "loss": 4.0930094718933105, "lr": 8.549377676200464e-05, "elapsed_sec": 73948.2089214325, "step_time_sec": 8.229932804999407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8916, "loss": 3.8819851875305176, "lr": 8.53769085638811e-05, "elapsed_sec": 73956.43878054619, "step_time_sec": 8.22973802901106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8917, "loss": 4.05012845993042, "lr": 8.526007137810237e-05, "elapsed_sec": 73964.66958618164, "step_time_sec": 8.230589721002616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8918, "loss": 3.9237866401672363, "lr": 8.514326538931792e-05, "elapsed_sec": 73972.90080332756, "step_time_sec": 8.231059771031141, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8919, "loss": 4.040241718292236, "lr": 8.502649078212807e-05, "elapsed_sec": 73981.13126707077, "step_time_sec": 8.23029127100017, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8920, "loss": 4.082338809967041, "lr": 8.490974774108344e-05, "elapsed_sec": 73989.36025094986, "step_time_sec": 8.228835568996146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8921, "loss": 3.886446237564087, "lr": 8.47930364506848e-05, "elapsed_sec": 73997.5890071392, "step_time_sec": 8.228609052021056, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8922, "loss": 3.830479145050049, "lr": 8.467635709538278e-05, "elapsed_sec": 74005.81774878502, "step_time_sec": 8.228532234963495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8923, "loss": 3.9593913555145264, "lr": 8.455970985957741e-05, "elapsed_sec": 74014.046813488, "step_time_sec": 8.228891670994926, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8924, "loss": 4.115396976470947, "lr": 8.444309492761813e-05, "elapsed_sec": 74022.2734837532, "step_time_sec": 8.22651288000634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8925, "loss": 3.923236846923828, "lr": 8.432651248380322e-05, "elapsed_sec": 74030.50312590599, "step_time_sec": 8.22948037396418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8926, "loss": 4.118894100189209, "lr": 8.420996271237962e-05, "elapsed_sec": 74038.73191738129, "step_time_sec": 8.228646006027702, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8927, "loss": 3.950331211090088, "lr": 8.40934457975427e-05, "elapsed_sec": 74046.95926165581, "step_time_sec": 8.227136695000809, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8928, "loss": 4.181774139404297, "lr": 8.39769619234358e-05, "elapsed_sec": 74055.19040608406, "step_time_sec": 8.231042840983719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8929, "loss": 3.9894275665283203, "lr": 8.386051127415015e-05, "elapsed_sec": 74063.42095208168, "step_time_sec": 8.23035595699912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8930, "loss": 3.813159227371216, "lr": 8.374409403372439e-05, "elapsed_sec": 74071.64996528625, "step_time_sec": 8.228853781998623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8931, "loss": 3.8734049797058105, "lr": 8.36277103861444e-05, "elapsed_sec": 74079.87913060188, "step_time_sec": 8.228951647004578, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8932, "loss": 3.9744019508361816, "lr": 8.351136051534297e-05, "elapsed_sec": 74088.10812401772, "step_time_sec": 8.228865551005583, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8933, "loss": 4.158337116241455, "lr": 8.339504460519948e-05, "elapsed_sec": 74096.33986830711, "step_time_sec": 8.231527108000591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8934, "loss": 3.802748680114746, "lr": 8.327876283953969e-05, "elapsed_sec": 74104.57066297531, "step_time_sec": 8.230640435998794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8935, "loss": 3.9030022621154785, "lr": 8.316251540213536e-05, "elapsed_sec": 74112.7999022007, "step_time_sec": 8.229116356000304, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8936, "loss": 3.7849597930908203, "lr": 8.304630247670401e-05, "elapsed_sec": 74121.03067040443, "step_time_sec": 8.230587644968182, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8937, "loss": 4.033621311187744, "lr": 8.293012424690859e-05, "elapsed_sec": 74129.26233649254, "step_time_sec": 8.231480527028907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8938, "loss": 4.065943241119385, "lr": 8.281398089635723e-05, "elapsed_sec": 74137.49268603325, "step_time_sec": 8.230193965951912, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8939, "loss": 4.0400238037109375, "lr": 8.269787260860301e-05, "elapsed_sec": 74145.72331118584, "step_time_sec": 8.23049721098505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8940, "loss": 3.912747859954834, "lr": 8.258179956714348e-05, "elapsed_sec": 74153.95181393623, "step_time_sec": 8.228309341997374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8941, "loss": 4.066464424133301, "lr": 8.246576195542054e-05, "elapsed_sec": 74162.18256878853, "step_time_sec": 8.230583944008686, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8942, "loss": 3.859767436981201, "lr": 8.234975995682011e-05, "elapsed_sec": 74170.41281890869, "step_time_sec": 8.230102412053384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8943, "loss": 4.068592548370361, "lr": 8.223379375467184e-05, "elapsed_sec": 74178.64227819443, "step_time_sec": 8.229341944970656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8944, "loss": 3.9963479042053223, "lr": 8.211786353224872e-05, "elapsed_sec": 74186.86980962753, "step_time_sec": 8.227369564992841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8945, "loss": 4.049694061279297, "lr": 8.200196947276699e-05, "elapsed_sec": 74195.1000931263, "step_time_sec": 8.230081567016896, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8946, "loss": 3.8980181217193604, "lr": 8.188611175938569e-05, "elapsed_sec": 74203.33030962944, "step_time_sec": 8.23003149899887, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8947, "loss": 4.135015487670898, "lr": 8.177029057520636e-05, "elapsed_sec": 74211.56213474274, "step_time_sec": 8.231702721037436, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8948, "loss": 4.003182411193848, "lr": 8.165450610327295e-05, "elapsed_sec": 74219.790964365, "step_time_sec": 8.228629545017611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8949, "loss": 3.9946494102478027, "lr": 8.15387585265712e-05, "elapsed_sec": 74228.0196185112, "step_time_sec": 8.228492899041157, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8950, "loss": 4.084916114807129, "lr": 8.142304802802875e-05, "elapsed_sec": 74236.24812269211, "step_time_sec": 8.228345722018275, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8951, "loss": 3.9052414894104004, "lr": 8.130737479051446e-05, "elapsed_sec": 74244.4784655571, "step_time_sec": 8.23021016601706, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8952, "loss": 4.152119159698486, "lr": 8.11917389968384e-05, "elapsed_sec": 74252.70961403847, "step_time_sec": 8.230983196001034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8953, "loss": 4.034235000610352, "lr": 8.10761408297515e-05, "elapsed_sec": 74260.93993663788, "step_time_sec": 8.230164017004427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8954, "loss": 4.156839847564697, "lr": 8.096058047194507e-05, "elapsed_sec": 74269.17073941231, "step_time_sec": 8.230620438989718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8955, "loss": 3.9638047218322754, "lr": 8.084505810605086e-05, "elapsed_sec": 74277.40214180946, "step_time_sec": 8.231316610996146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8956, "loss": 4.072976589202881, "lr": 8.07295739146404e-05, "elapsed_sec": 74285.62949442863, "step_time_sec": 8.227131499967072, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8957, "loss": 3.921566963195801, "lr": 8.061412808022501e-05, "elapsed_sec": 74293.85881638527, "step_time_sec": 8.229196475993376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8958, "loss": 4.00291633605957, "lr": 8.049872078525538e-05, "elapsed_sec": 74302.08968544006, "step_time_sec": 8.230660149012692, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8959, "loss": 3.9868264198303223, "lr": 8.038335221212121e-05, "elapsed_sec": 74310.32069253922, "step_time_sec": 8.230853842978831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8960, "loss": 3.990278720855713, "lr": 8.026802254315111e-05, "elapsed_sec": 74318.55187773705, "step_time_sec": 8.231045971042477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8961, "loss": 3.931891441345215, "lr": 8.015273196061208e-05, "elapsed_sec": 74326.78236985207, "step_time_sec": 8.23038862698013, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8962, "loss": 4.1246337890625, "lr": 8.003748064670948e-05, "elapsed_sec": 74335.01394200325, "step_time_sec": 8.23135820304742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8963, "loss": 4.036827087402344, "lr": 7.992226878358657e-05, "elapsed_sec": 74343.24421429634, "step_time_sec": 8.230156744015403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8964, "loss": 3.9702131748199463, "lr": 7.980709655332419e-05, "elapsed_sec": 74351.47509908676, "step_time_sec": 8.230671009980142, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8965, "loss": 3.8878448009490967, "lr": 7.969196413794065e-05, "elapsed_sec": 74359.70624470711, "step_time_sec": 8.230973833997268, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8966, "loss": 3.957819700241089, "lr": 7.957687171939122e-05, "elapsed_sec": 74367.93509626389, "step_time_sec": 8.228757623990532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8967, "loss": 4.103140830993652, "lr": 7.94618194795681e-05, "elapsed_sec": 74376.16409778595, "step_time_sec": 8.228739632002544, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8968, "loss": 3.828500747680664, "lr": 7.934680760029982e-05, "elapsed_sec": 74384.3909714222, "step_time_sec": 8.226741413993295, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8969, "loss": 3.9244089126586914, "lr": 7.923183626335128e-05, "elapsed_sec": 74392.62361240387, "step_time_sec": 8.232456409023143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8970, "loss": 3.994502305984497, "lr": 7.911690565042327e-05, "elapsed_sec": 74400.85420680046, "step_time_sec": 8.230464093037881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8971, "loss": 3.924790143966675, "lr": 7.900201594315214e-05, "elapsed_sec": 74409.08554077148, "step_time_sec": 8.231167351012118, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8972, "loss": 4.1276960372924805, "lr": 7.888716732310969e-05, "elapsed_sec": 74417.31625676155, "step_time_sec": 8.230530511005782, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8973, "loss": 3.9693074226379395, "lr": 7.877235997180269e-05, "elapsed_sec": 74425.54770493507, "step_time_sec": 8.231282585998997, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8974, "loss": 4.012401103973389, "lr": 7.865759407067278e-05, "elapsed_sec": 74433.77750372887, "step_time_sec": 8.22969051398104, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8975, "loss": 4.041694164276123, "lr": 7.854286980109609e-05, "elapsed_sec": 74442.00778651237, "step_time_sec": 8.230106890026946, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8976, "loss": 3.8294131755828857, "lr": 7.842818734438285e-05, "elapsed_sec": 74450.23772764206, "step_time_sec": 8.229736759036314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8977, "loss": 3.978301763534546, "lr": 7.831354688177734e-05, "elapsed_sec": 74458.46804618835, "step_time_sec": 8.230251225002576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8978, "loss": 3.965348720550537, "lr": 7.819894859445735e-05, "elapsed_sec": 74466.69913244247, "step_time_sec": 8.230864958022721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8979, "loss": 4.037569522857666, "lr": 7.808439266353417e-05, "elapsed_sec": 74474.92993855476, "step_time_sec": 8.230711529031396, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8980, "loss": 3.9426310062408447, "lr": 7.796987927005192e-05, "elapsed_sec": 74483.1602602005, "step_time_sec": 8.230081143963616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8981, "loss": 3.945420980453491, "lr": 7.785540859498775e-05, "elapsed_sec": 74491.39084720612, "step_time_sec": 8.23042430903297, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8982, "loss": 3.949613332748413, "lr": 7.774098081925115e-05, "elapsed_sec": 74499.62079429626, "step_time_sec": 8.229740437993314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8983, "loss": 3.9077954292297363, "lr": 7.76265961236838e-05, "elapsed_sec": 74507.85066080093, "step_time_sec": 8.229745015967637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8984, "loss": 3.9541797637939453, "lr": 7.751225468905943e-05, "elapsed_sec": 74516.08156824112, "step_time_sec": 8.23072328302078, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8985, "loss": 3.9510138034820557, "lr": 7.739795669608317e-05, "elapsed_sec": 74524.31271219254, "step_time_sec": 8.23101879900787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8986, "loss": 3.8379874229431152, "lr": 7.728370232539174e-05, "elapsed_sec": 74532.54372406006, "step_time_sec": 8.230814529000781, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8987, "loss": 3.9137139320373535, "lr": 7.716949175755282e-05, "elapsed_sec": 74540.77371072769, "step_time_sec": 8.229867187037598, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8988, "loss": 4.057743072509766, "lr": 7.705532517306476e-05, "elapsed_sec": 74549.00512886047, "step_time_sec": 8.231284441018943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8989, "loss": 4.0531792640686035, "lr": 7.694120275235659e-05, "elapsed_sec": 74557.23536610603, "step_time_sec": 8.230063306051306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8990, "loss": 4.017405986785889, "lr": 7.682712467578738e-05, "elapsed_sec": 74565.46680021286, "step_time_sec": 8.23127063899301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8991, "loss": 3.9739701747894287, "lr": 7.671309112364622e-05, "elapsed_sec": 74573.69692230225, "step_time_sec": 8.229959631979, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8992, "loss": 4.022279739379883, "lr": 7.659910227615182e-05, "elapsed_sec": 74581.92591500282, "step_time_sec": 8.228841823991388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8993, "loss": 3.990663766860962, "lr": 7.648515831345217e-05, "elapsed_sec": 74590.15249752998, "step_time_sec": 8.226377630024217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8994, "loss": 4.0685133934021, "lr": 7.637125941562444e-05, "elapsed_sec": 74598.38102245331, "step_time_sec": 8.228355647996068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8995, "loss": 3.838721990585327, "lr": 7.625740576267442e-05, "elapsed_sec": 74606.61196303368, "step_time_sec": 8.230863114004023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8996, "loss": 3.849311590194702, "lr": 7.614359753453659e-05, "elapsed_sec": 74614.843146801, "step_time_sec": 8.230928940989543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8997, "loss": 3.97025990486145, "lr": 7.602983491107342e-05, "elapsed_sec": 74623.0715932846, "step_time_sec": 8.22828252898762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8998, "loss": 4.021052360534668, "lr": 7.591611807207551e-05, "elapsed_sec": 74631.30117678642, "step_time_sec": 8.22941000800347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 8999, "loss": 3.9326860904693604, "lr": 7.580244719726102e-05, "elapsed_sec": 74639.53052091599, "step_time_sec": 8.229174067033455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9000, "loss": 3.9360899925231934, "lr": 7.568882246627542e-05, "elapsed_sec": 74647.76208424568, "step_time_sec": 52.69723253400298, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.9833371759741567, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9001, "loss": 3.911701202392578, "lr": 7.557524405869135e-05, "elapsed_sec": 74700.4562125206, "step_time_sec": 8.228184867999516, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9002, "loss": 3.947158098220825, "lr": 7.546171215400813e-05, "elapsed_sec": 74708.67279458046, "step_time_sec": 8.21638536802493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9003, "loss": 3.9195480346679688, "lr": 7.534822693165166e-05, "elapsed_sec": 74716.89813923836, "step_time_sec": 8.225247528986074, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9004, "loss": 4.039090633392334, "lr": 7.523478857097411e-05, "elapsed_sec": 74725.12785744667, "step_time_sec": 8.229519057029393, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9005, "loss": 4.032364368438721, "lr": 7.512139725125345e-05, "elapsed_sec": 74733.35642457008, "step_time_sec": 8.228407274000347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9006, "loss": 4.012650012969971, "lr": 7.500805315169345e-05, "elapsed_sec": 74741.58407521248, "step_time_sec": 8.227485276991501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9007, "loss": 3.8534913063049316, "lr": 7.489475645142313e-05, "elapsed_sec": 74749.81367564201, "step_time_sec": 8.229456731001846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9008, "loss": 4.122915744781494, "lr": 7.47815073294967e-05, "elapsed_sec": 74758.04545903206, "step_time_sec": 8.231633064977359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9009, "loss": 4.042388916015625, "lr": 7.466830596489311e-05, "elapsed_sec": 74766.27596592903, "step_time_sec": 8.23034488101257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9010, "loss": 3.891667127609253, "lr": 7.455515253651587e-05, "elapsed_sec": 74774.50634646416, "step_time_sec": 8.23023920803098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9011, "loss": 4.176833152770996, "lr": 7.444204722319275e-05, "elapsed_sec": 74782.73673582077, "step_time_sec": 8.230207352957223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9012, "loss": 4.086357116699219, "lr": 7.43289902036754e-05, "elapsed_sec": 74790.96762275696, "step_time_sec": 8.230745842971373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9013, "loss": 4.0441436767578125, "lr": 7.421598165663926e-05, "elapsed_sec": 74799.19822216034, "step_time_sec": 8.230473320989404, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9014, "loss": 4.122117519378662, "lr": 7.410302176068304e-05, "elapsed_sec": 74807.42867732048, "step_time_sec": 8.230265630991198, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9015, "loss": 3.847987413406372, "lr": 7.399011069432864e-05, "elapsed_sec": 74815.6603987217, "step_time_sec": 8.231536057021003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9016, "loss": 4.014702796936035, "lr": 7.38772486360208e-05, "elapsed_sec": 74823.89072084427, "step_time_sec": 8.230158251011744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9017, "loss": 4.063714504241943, "lr": 7.376443576412675e-05, "elapsed_sec": 74832.1221151352, "step_time_sec": 8.231283727975097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9018, "loss": 3.9726145267486572, "lr": 7.365167225693604e-05, "elapsed_sec": 74840.35280108452, "step_time_sec": 8.230506027000956, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9019, "loss": 4.048367977142334, "lr": 7.353895829266016e-05, "elapsed_sec": 74848.5831375122, "step_time_sec": 8.230180234008003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9020, "loss": 4.001279830932617, "lr": 7.342629404943234e-05, "elapsed_sec": 74856.81381249428, "step_time_sec": 8.230508422013372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9021, "loss": 4.016814708709717, "lr": 7.331367970530721e-05, "elapsed_sec": 74865.04491329193, "step_time_sec": 8.230958953965455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9022, "loss": 4.0052361488342285, "lr": 7.320111543826052e-05, "elapsed_sec": 74873.27578663826, "step_time_sec": 8.230701828026213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9023, "loss": 3.8270559310913086, "lr": 7.308860142618892e-05, "elapsed_sec": 74881.5062084198, "step_time_sec": 8.23025989398593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9024, "loss": 4.11392068862915, "lr": 7.297613784690964e-05, "elapsed_sec": 74889.73733901978, "step_time_sec": 8.230966978997458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9025, "loss": 3.9786221981048584, "lr": 7.286372487816014e-05, "elapsed_sec": 74897.96812701225, "step_time_sec": 8.230660389002878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9026, "loss": 3.9749655723571777, "lr": 7.275136269759798e-05, "elapsed_sec": 74906.19837260246, "step_time_sec": 8.230056939995848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9027, "loss": 4.014305591583252, "lr": 7.263905148280037e-05, "elapsed_sec": 74914.4298286438, "step_time_sec": 8.231368031003512, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9028, "loss": 3.914410352706909, "lr": 7.252679141126405e-05, "elapsed_sec": 74922.66044855118, "step_time_sec": 8.230389840959106, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9029, "loss": 4.2299089431762695, "lr": 7.24145826604049e-05, "elapsed_sec": 74930.8913962841, "step_time_sec": 8.230871042993385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9030, "loss": 3.948302745819092, "lr": 7.230242540755769e-05, "elapsed_sec": 74939.12152957916, "step_time_sec": 8.229961759992875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9031, "loss": 3.970876932144165, "lr": 7.219031982997579e-05, "elapsed_sec": 74947.35175728798, "step_time_sec": 8.230046749988105, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9032, "loss": 4.068755626678467, "lr": 7.207826610483094e-05, "elapsed_sec": 74955.58192658424, "step_time_sec": 8.230069054989144, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9033, "loss": 3.91737699508667, "lr": 7.196626440921286e-05, "elapsed_sec": 74963.81291103363, "step_time_sec": 8.230702809989452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9034, "loss": 4.104964256286621, "lr": 7.185431492012917e-05, "elapsed_sec": 74972.04419517517, "step_time_sec": 8.231132538989186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9035, "loss": 4.023366451263428, "lr": 7.174241781450487e-05, "elapsed_sec": 74980.27440881729, "step_time_sec": 8.230077565996908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9036, "loss": 4.009715557098389, "lr": 7.16305732691822e-05, "elapsed_sec": 74988.50796222687, "step_time_sec": 8.23339449102059, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9037, "loss": 4.1377739906311035, "lr": 7.151878146092034e-05, "elapsed_sec": 74996.73753714561, "step_time_sec": 8.22940141899744, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9038, "loss": 3.9784340858459473, "lr": 7.140704256639511e-05, "elapsed_sec": 75004.96831130981, "step_time_sec": 8.230663519992959, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9039, "loss": 3.897658109664917, "lr": 7.129535676219879e-05, "elapsed_sec": 75013.19884443283, "step_time_sec": 8.230348967015743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9040, "loss": 4.025088310241699, "lr": 7.118372422483962e-05, "elapsed_sec": 75021.42935204506, "step_time_sec": 8.23031907499535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9041, "loss": 3.982135534286499, "lr": 7.107214513074175e-05, "elapsed_sec": 75029.65976786613, "step_time_sec": 8.230233807000332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9042, "loss": 3.92281436920166, "lr": 7.096061965624488e-05, "elapsed_sec": 75037.89077186584, "step_time_sec": 8.230859051982407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9043, "loss": 4.088205814361572, "lr": 7.084914797760386e-05, "elapsed_sec": 75046.1213209629, "step_time_sec": 8.230427544971462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9044, "loss": 3.9797892570495605, "lr": 7.073773027098866e-05, "elapsed_sec": 75054.35111689568, "step_time_sec": 8.229638444026932, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9045, "loss": 3.917300224304199, "lr": 7.062636671248381e-05, "elapsed_sec": 75062.57594680786, "step_time_sec": 8.224648257018998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9046, "loss": 3.945065498352051, "lr": 7.051505747808841e-05, "elapsed_sec": 75070.80412054062, "step_time_sec": 8.227982020005584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9047, "loss": 4.014979839324951, "lr": 7.040380274371564e-05, "elapsed_sec": 75079.03453946114, "step_time_sec": 8.230261220014654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9048, "loss": 3.8252525329589844, "lr": 7.029260268519248e-05, "elapsed_sec": 75087.26465964317, "step_time_sec": 8.229969981999602, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9049, "loss": 3.9890296459198, "lr": 7.018145747825963e-05, "elapsed_sec": 75095.49278521538, "step_time_sec": 8.227945293998346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9050, "loss": 4.058050155639648, "lr": 7.0070367298571e-05, "elapsed_sec": 75103.72053098679, "step_time_sec": 8.22757221799111, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9051, "loss": 4.0064496994018555, "lr": 6.995933232169357e-05, "elapsed_sec": 75111.9503467083, "step_time_sec": 8.22968073899392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9052, "loss": 3.8732423782348633, "lr": 6.984835272310712e-05, "elapsed_sec": 75120.18069219589, "step_time_sec": 8.230176757031586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9053, "loss": 4.061489582061768, "lr": 6.973742867820383e-05, "elapsed_sec": 75128.4121518135, "step_time_sec": 8.231315799988806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9054, "loss": 4.01174783706665, "lr": 6.962656036228815e-05, "elapsed_sec": 75136.64029216766, "step_time_sec": 8.228009181038942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9055, "loss": 3.9437644481658936, "lr": 6.951574795057641e-05, "elapsed_sec": 75144.86813235283, "step_time_sec": 8.227628035994712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9056, "loss": 4.005334377288818, "lr": 6.940499161819663e-05, "elapsed_sec": 75153.09671854973, "step_time_sec": 8.228432756965049, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9057, "loss": 4.130579471588135, "lr": 6.929429154018814e-05, "elapsed_sec": 75161.32762050629, "step_time_sec": 8.23076383699663, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9058, "loss": 4.140211582183838, "lr": 6.918364789150143e-05, "elapsed_sec": 75169.55885910988, "step_time_sec": 8.2311090810108, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9059, "loss": 3.830761432647705, "lr": 6.90730608469978e-05, "elapsed_sec": 75177.78893446922, "step_time_sec": 8.22990158101311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9060, "loss": 4.002269268035889, "lr": 6.896253058144904e-05, "elapsed_sec": 75186.02036976814, "step_time_sec": 8.231266841001343, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9061, "loss": 4.000836372375488, "lr": 6.885205726953732e-05, "elapsed_sec": 75194.25186729431, "step_time_sec": 8.231413645960856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9062, "loss": 3.9237778186798096, "lr": 6.874164108585461e-05, "elapsed_sec": 75202.48281431198, "step_time_sec": 8.230744541040622, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9063, "loss": 3.9218966960906982, "lr": 6.863128220490277e-05, "elapsed_sec": 75210.713088274, "step_time_sec": 8.230099030013662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9064, "loss": 4.142289161682129, "lr": 6.85209808010931e-05, "elapsed_sec": 75218.94125676155, "step_time_sec": 8.227990068960935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9065, "loss": 3.9881973266601562, "lr": 6.841073704874588e-05, "elapsed_sec": 75227.17244911194, "step_time_sec": 8.231034739990719, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9066, "loss": 3.960620641708374, "lr": 6.83005511220905e-05, "elapsed_sec": 75235.40414690971, "step_time_sec": 8.231605289969593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9067, "loss": 3.8963193893432617, "lr": 6.819042319526478e-05, "elapsed_sec": 75243.63522863388, "step_time_sec": 8.230860239942558, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9068, "loss": 3.89780855178833, "lr": 6.808035344231503e-05, "elapsed_sec": 75251.86601519585, "step_time_sec": 8.230632770049851, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9069, "loss": 3.9196276664733887, "lr": 6.797034203719547e-05, "elapsed_sec": 75260.09722423553, "step_time_sec": 8.231101153010968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9070, "loss": 4.025026798248291, "lr": 6.786038915376826e-05, "elapsed_sec": 75268.3278696537, "step_time_sec": 8.230475471005775, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9071, "loss": 4.012434482574463, "lr": 6.775049496580297e-05, "elapsed_sec": 75276.55862784386, "step_time_sec": 8.230533694964834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9072, "loss": 3.8991644382476807, "lr": 6.764065964697643e-05, "elapsed_sec": 75284.78586006165, "step_time_sec": 8.227076352981385, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9073, "loss": 3.9811789989471436, "lr": 6.753088337087246e-05, "elapsed_sec": 75293.01690459251, "step_time_sec": 8.230869788036216, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9074, "loss": 3.959756851196289, "lr": 6.742116631098151e-05, "elapsed_sec": 75301.24781799316, "step_time_sec": 8.23075681301998, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9075, "loss": 4.030013561248779, "lr": 6.731150864070051e-05, "elapsed_sec": 75309.47879314423, "step_time_sec": 8.230855884030461, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9076, "loss": 3.989290952682495, "lr": 6.720191053333254e-05, "elapsed_sec": 75317.70910859108, "step_time_sec": 8.230180658982135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9077, "loss": 4.002325057983398, "lr": 6.709237216208643e-05, "elapsed_sec": 75325.93975067139, "step_time_sec": 8.230410987976938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9078, "loss": 3.926994800567627, "lr": 6.698289370007677e-05, "elapsed_sec": 75334.17035341263, "step_time_sec": 8.230456010031048, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9079, "loss": 4.174647808074951, "lr": 6.68734753203233e-05, "elapsed_sec": 75342.40151691437, "step_time_sec": 8.231003666005563, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9080, "loss": 3.9725005626678467, "lr": 6.676411719575093e-05, "elapsed_sec": 75350.63244605064, "step_time_sec": 8.230780873040203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9081, "loss": 4.03554105758667, "lr": 6.665481949918934e-05, "elapsed_sec": 75358.86387252808, "step_time_sec": 8.231328200025018, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9082, "loss": 3.927546739578247, "lr": 6.654558240337259e-05, "elapsed_sec": 75367.09417629242, "step_time_sec": 8.230077999003697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9083, "loss": 3.918297052383423, "lr": 6.643640608093912e-05, "elapsed_sec": 75375.32426714897, "step_time_sec": 8.23000309796771, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9084, "loss": 3.9280805587768555, "lr": 6.63272907044312e-05, "elapsed_sec": 75383.55477499962, "step_time_sec": 8.230272896005772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9085, "loss": 3.7980127334594727, "lr": 6.621823644629488e-05, "elapsed_sec": 75391.78532528877, "step_time_sec": 8.23042857600376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9086, "loss": 3.9574942588806152, "lr": 6.610924347887953e-05, "elapsed_sec": 75400.01656126976, "step_time_sec": 8.231051604030654, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9087, "loss": 3.9843697547912598, "lr": 6.600031197443769e-05, "elapsed_sec": 75408.24809384346, "step_time_sec": 8.231426417012699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9088, "loss": 3.8140347003936768, "lr": 6.589144210512483e-05, "elapsed_sec": 75416.47881603241, "step_time_sec": 8.230502359045204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9089, "loss": 4.114193439483643, "lr": 6.578263404299888e-05, "elapsed_sec": 75424.70988559723, "step_time_sec": 8.230971848010086, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9090, "loss": 4.08737325668335, "lr": 6.567388796002024e-05, "elapsed_sec": 75432.94020462036, "step_time_sec": 8.230162015010137, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9091, "loss": 4.046204566955566, "lr": 6.556520402805121e-05, "elapsed_sec": 75441.1716310978, "step_time_sec": 8.231209010002203, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9092, "loss": 4.259087085723877, "lr": 6.545658241885596e-05, "elapsed_sec": 75449.40224051476, "step_time_sec": 8.230466253997292, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9093, "loss": 3.909079074859619, "lr": 6.534802330410017e-05, "elapsed_sec": 75457.63179016113, "step_time_sec": 8.229454350017477, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9094, "loss": 4.017058849334717, "lr": 6.52395268553507e-05, "elapsed_sec": 75465.86131095886, "step_time_sec": 8.229345548024867, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9095, "loss": 3.9026999473571777, "lr": 6.513109324407541e-05, "elapsed_sec": 75474.09255456924, "step_time_sec": 8.231039300968405, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9096, "loss": 3.8495185375213623, "lr": 6.502272264164283e-05, "elapsed_sec": 75482.32267975807, "step_time_sec": 8.229964451980777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9097, "loss": 4.104247093200684, "lr": 6.491441521932192e-05, "elapsed_sec": 75490.55319547653, "step_time_sec": 8.230372355959844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9098, "loss": 3.9153661727905273, "lr": 6.480617114828182e-05, "elapsed_sec": 75498.7829656601, "step_time_sec": 8.229613457981031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9099, "loss": 3.9722909927368164, "lr": 6.469799059959143e-05, "elapsed_sec": 75507.01315379143, "step_time_sec": 8.230037040018942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9100, "loss": 3.9926986694335938, "lr": 6.458987374421951e-05, "elapsed_sec": 75515.24464249611, "step_time_sec": 8.231342083017807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9101, "loss": 3.8921990394592285, "lr": 6.448182075303385e-05, "elapsed_sec": 75523.4735891819, "step_time_sec": 8.228769091016147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9102, "loss": 3.953434705734253, "lr": 6.437383179680155e-05, "elapsed_sec": 75531.70094680786, "step_time_sec": 8.227218911983073, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9103, "loss": 3.989896059036255, "lr": 6.426590704618834e-05, "elapsed_sec": 75539.92930841446, "step_time_sec": 8.228248702012934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9104, "loss": 3.921700954437256, "lr": 6.415804667175863e-05, "elapsed_sec": 75548.15793967247, "step_time_sec": 8.22843819099944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9105, "loss": 4.05145263671875, "lr": 6.405025084397498e-05, "elapsed_sec": 75556.38701057434, "step_time_sec": 8.228900005982723, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9106, "loss": 4.097762584686279, "lr": 6.394251973319799e-05, "elapsed_sec": 75564.61748886108, "step_time_sec": 8.230332833016291, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9107, "loss": 4.118495464324951, "lr": 6.383485350968595e-05, "elapsed_sec": 75572.84785699844, "step_time_sec": 8.230219635006506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9108, "loss": 3.983177661895752, "lr": 6.372725234359465e-05, "elapsed_sec": 75581.07847166061, "step_time_sec": 8.230439513979945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9109, "loss": 3.910738706588745, "lr": 6.361971640497701e-05, "elapsed_sec": 75589.3089196682, "step_time_sec": 8.230280499032233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9110, "loss": 3.8025877475738525, "lr": 6.351224586378289e-05, "elapsed_sec": 75597.53928852081, "step_time_sec": 8.230306349985767, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9111, "loss": 4.037800312042236, "lr": 6.340484088985875e-05, "elapsed_sec": 75605.76997041702, "step_time_sec": 8.230448813003022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9112, "loss": 3.9999823570251465, "lr": 6.32975016529476e-05, "elapsed_sec": 75614.00014662743, "step_time_sec": 8.230043106013909, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9113, "loss": 4.07348108291626, "lr": 6.319022832268828e-05, "elapsed_sec": 75622.23113656044, "step_time_sec": 8.230747189023532, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9114, "loss": 3.858203649520874, "lr": 6.308302106861576e-05, "elapsed_sec": 75630.46126770973, "step_time_sec": 8.22998474596534, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9115, "loss": 3.9546735286712646, "lr": 6.29758800601603e-05, "elapsed_sec": 75638.69143772125, "step_time_sec": 8.229986247024499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9116, "loss": 4.05465030670166, "lr": 6.286880546664773e-05, "elapsed_sec": 75646.92271327972, "step_time_sec": 8.231167464982718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9117, "loss": 3.9166343212127686, "lr": 6.276179745729874e-05, "elapsed_sec": 75655.15450549126, "step_time_sec": 8.23166966100689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9118, "loss": 3.820749044418335, "lr": 6.265485620122888e-05, "elapsed_sec": 75663.38599276543, "step_time_sec": 8.231254137994256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9119, "loss": 4.010116100311279, "lr": 6.254798186744813e-05, "elapsed_sec": 75671.61593723297, "step_time_sec": 8.229788987024222, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9120, "loss": 3.9904849529266357, "lr": 6.244117462486079e-05, "elapsed_sec": 75679.84644627571, "step_time_sec": 8.230397994979285, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9121, "loss": 3.815889596939087, "lr": 6.233443464226507e-05, "elapsed_sec": 75688.07746243477, "step_time_sec": 8.230839065043256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9122, "loss": 4.109527111053467, "lr": 6.222776208835287e-05, "elapsed_sec": 75696.30901432037, "step_time_sec": 8.231403479992878, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9123, "loss": 3.9124317169189453, "lr": 6.212115713170956e-05, "elapsed_sec": 75704.53938341141, "step_time_sec": 8.230177248013206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9124, "loss": 4.13474178314209, "lr": 6.201461994081378e-05, "elapsed_sec": 75712.77118968964, "step_time_sec": 8.23162647697609, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9125, "loss": 3.8608062267303467, "lr": 6.19081506840368e-05, "elapsed_sec": 75721.00096440315, "step_time_sec": 8.229632436006796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9126, "loss": 3.9045968055725098, "lr": 6.180174952964283e-05, "elapsed_sec": 75729.23125314713, "step_time_sec": 8.230099880020134, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9127, "loss": 3.910386562347412, "lr": 6.16954166457882e-05, "elapsed_sec": 75737.46210360527, "step_time_sec": 8.230747443973087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9128, "loss": 3.932112455368042, "lr": 6.158915220052155e-05, "elapsed_sec": 75745.69175601006, "step_time_sec": 8.229425574012566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9129, "loss": 4.0071024894714355, "lr": 6.148295636178324e-05, "elapsed_sec": 75753.9226295948, "step_time_sec": 8.230715567013249, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9130, "loss": 3.9348490238189697, "lr": 6.137682929740523e-05, "elapsed_sec": 75762.15332770348, "step_time_sec": 8.230533574009314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9131, "loss": 3.9718871116638184, "lr": 6.127077117511079e-05, "elapsed_sec": 75770.38415169716, "step_time_sec": 8.230680547014344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9132, "loss": 4.061545372009277, "lr": 6.116478216251424e-05, "elapsed_sec": 75778.61400866508, "step_time_sec": 8.22969542897772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9133, "loss": 3.9582366943359375, "lr": 6.105886242712068e-05, "elapsed_sec": 75786.84461259842, "step_time_sec": 8.230451620009262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9134, "loss": 4.0319013595581055, "lr": 6.0953012136325694e-05, "elapsed_sec": 75795.07589387894, "step_time_sec": 8.231183028023224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9135, "loss": 4.080520153045654, "lr": 6.084723145741514e-05, "elapsed_sec": 75803.30542135239, "step_time_sec": 8.229309214977548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9136, "loss": 4.059848308563232, "lr": 6.074152055756495e-05, "elapsed_sec": 75811.53623914719, "step_time_sec": 8.230639019981027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9137, "loss": 3.9147789478302, "lr": 6.0635879603840555e-05, "elapsed_sec": 75819.76414036751, "step_time_sec": 8.22779353801161, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9138, "loss": 3.9670586585998535, "lr": 6.0530308763197123e-05, "elapsed_sec": 75827.99506402016, "step_time_sec": 8.230744791973848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9139, "loss": 4.033074378967285, "lr": 6.042480820247872e-05, "elapsed_sec": 75836.22606706619, "step_time_sec": 8.230799940007273, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9140, "loss": 3.888608455657959, "lr": 6.031937808841859e-05, "elapsed_sec": 75844.45654082298, "step_time_sec": 8.230330437014345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9141, "loss": 4.2190423011779785, "lr": 6.021401858763853e-05, "elapsed_sec": 75852.68796825409, "step_time_sec": 8.231238072970882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9142, "loss": 4.213825225830078, "lr": 6.0108729866648735e-05, "elapsed_sec": 75860.91836929321, "step_time_sec": 8.230269702966325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9143, "loss": 3.9849183559417725, "lr": 6.000351209184758e-05, "elapsed_sec": 75869.14937329292, "step_time_sec": 8.230830040993169, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9144, "loss": 3.9067845344543457, "lr": 5.989836542952127e-05, "elapsed_sec": 75877.37934803963, "step_time_sec": 8.229810683988035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9145, "loss": 3.959580183029175, "lr": 5.9793290045843674e-05, "elapsed_sec": 75885.60822534561, "step_time_sec": 8.22878867998952, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9146, "loss": 4.057381629943848, "lr": 5.968828610687597e-05, "elapsed_sec": 75893.83701467514, "step_time_sec": 8.228564404009376, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9147, "loss": 4.2311787605285645, "lr": 5.958335377856642e-05, "elapsed_sec": 75902.06492495537, "step_time_sec": 8.227704317017924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9148, "loss": 3.9814226627349854, "lr": 5.947849322675023e-05, "elapsed_sec": 75910.29557609558, "step_time_sec": 8.230504692997783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9149, "loss": 4.075140953063965, "lr": 5.937370461714893e-05, "elapsed_sec": 75918.52639770508, "step_time_sec": 8.23065871198196, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9150, "loss": 4.0025200843811035, "lr": 5.926898811537066e-05, "elapsed_sec": 75926.75656795502, "step_time_sec": 8.23006021004403, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9151, "loss": 3.882326364517212, "lr": 5.916434388690928e-05, "elapsed_sec": 75934.9871942997, "step_time_sec": 8.230476233991794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9152, "loss": 3.986959934234619, "lr": 5.9059772097144694e-05, "elapsed_sec": 75943.21838235855, "step_time_sec": 8.231017054989934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9153, "loss": 3.962545871734619, "lr": 5.895527291134219e-05, "elapsed_sec": 75951.45013833046, "step_time_sec": 8.231572291988414, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9154, "loss": 4.007264137268066, "lr": 5.885084649465235e-05, "elapsed_sec": 75959.68095803261, "step_time_sec": 8.230624751013238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9155, "loss": 4.115708827972412, "lr": 5.8746493012110706e-05, "elapsed_sec": 75967.91140437126, "step_time_sec": 8.230296794034075, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9156, "loss": 4.0520853996276855, "lr": 5.86422126286376e-05, "elapsed_sec": 75976.14223623276, "step_time_sec": 8.230657876993064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9157, "loss": 3.94707989692688, "lr": 5.853800550903778e-05, "elapsed_sec": 75984.37352800369, "step_time_sec": 8.231154543987941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9158, "loss": 3.97344708442688, "lr": 5.8433871818000246e-05, "elapsed_sec": 75992.60411405563, "step_time_sec": 8.230420665000565, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9159, "loss": 4.102200031280518, "lr": 5.832981172009792e-05, "elapsed_sec": 76000.83477902412, "step_time_sec": 8.230522870959248, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9160, "loss": 4.05454158782959, "lr": 5.822582537978749e-05, "elapsed_sec": 76009.06545209885, "step_time_sec": 8.230487346998416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9161, "loss": 3.959054470062256, "lr": 5.812191296140901e-05, "elapsed_sec": 76017.29559230804, "step_time_sec": 8.229993654997088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9162, "loss": 3.957401752471924, "lr": 5.801807462918571e-05, "elapsed_sec": 76025.52698397636, "step_time_sec": 8.231240223976783, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9163, "loss": 4.072127819061279, "lr": 5.7914310547223755e-05, "elapsed_sec": 76033.7577240467, "step_time_sec": 8.230552308959886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9164, "loss": 4.0916829109191895, "lr": 5.7810620879511944e-05, "elapsed_sec": 76041.98849487305, "step_time_sec": 8.230642051028553, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9165, "loss": 3.9216437339782715, "lr": 5.7707005789921516e-05, "elapsed_sec": 76050.21815538406, "step_time_sec": 8.22954969399143, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9166, "loss": 4.094335079193115, "lr": 5.76034654422058e-05, "elapsed_sec": 76058.44798755646, "step_time_sec": 8.229670436994638, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9167, "loss": 3.9874179363250732, "lr": 5.750000000000002e-05, "elapsed_sec": 76066.67679524422, "step_time_sec": 8.228614018007647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9168, "loss": 3.9325408935546875, "lr": 5.739660962682101e-05, "elapsed_sec": 76074.90556716919, "step_time_sec": 8.228565482015256, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9169, "loss": 4.153478145599365, "lr": 5.729329448606696e-05, "elapsed_sec": 76083.13712286949, "step_time_sec": 8.231457130983472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9170, "loss": 4.019745349884033, "lr": 5.7190054741017234e-05, "elapsed_sec": 76091.36751937866, "step_time_sec": 8.230182499974035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9171, "loss": 3.9931702613830566, "lr": 5.708689055483195e-05, "elapsed_sec": 76099.59840393066, "step_time_sec": 8.230723531043623, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9172, "loss": 3.859292984008789, "lr": 5.698380209055185e-05, "elapsed_sec": 76107.82932853699, "step_time_sec": 8.230771695962176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9173, "loss": 4.004998683929443, "lr": 5.6880789511097996e-05, "elapsed_sec": 76116.05989527702, "step_time_sec": 8.230411779950373, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9174, "loss": 4.070640563964844, "lr": 5.6777852979271513e-05, "elapsed_sec": 76124.28699970245, "step_time_sec": 8.226962583954446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9175, "loss": 3.9853198528289795, "lr": 5.667499265775338e-05, "elapsed_sec": 76132.5165143013, "step_time_sec": 8.229318675992545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9176, "loss": 4.1297926902771, "lr": 5.6572208709104096e-05, "elapsed_sec": 76140.74665904045, "step_time_sec": 8.230051752005238, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9177, "loss": 3.9090182781219482, "lr": 5.646950129576347e-05, "elapsed_sec": 76148.97755479813, "step_time_sec": 8.23064223799156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9178, "loss": 3.996940851211548, "lr": 5.636687058005032e-05, "elapsed_sec": 76157.20854902267, "step_time_sec": 8.230870129016694, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9179, "loss": 3.9426090717315674, "lr": 5.62643167241624e-05, "elapsed_sec": 76165.43925499916, "step_time_sec": 8.23052852897672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9180, "loss": 4.135957717895508, "lr": 5.6161839890175725e-05, "elapsed_sec": 76173.6696896553, "step_time_sec": 8.230257599032484, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9181, "loss": 3.981771945953369, "lr": 5.605944024004487e-05, "elapsed_sec": 76181.90165925026, "step_time_sec": 8.231796589971054, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9182, "loss": 3.985708236694336, "lr": 5.595711793560225e-05, "elapsed_sec": 76190.13201642036, "step_time_sec": 8.230240388016682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9183, "loss": 4.003069877624512, "lr": 5.585487313855809e-05, "elapsed_sec": 76198.3625035286, "step_time_sec": 8.230312303989194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9184, "loss": 3.867379665374756, "lr": 5.575270601050012e-05, "elapsed_sec": 76206.59277629852, "step_time_sec": 8.230037678033113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9185, "loss": 3.8358261585235596, "lr": 5.565061671289335e-05, "elapsed_sec": 76214.82244849205, "step_time_sec": 8.229575221019331, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9186, "loss": 3.9715473651885986, "lr": 5.5548605407079744e-05, "elapsed_sec": 76223.05035448074, "step_time_sec": 8.227664882026147, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9187, "loss": 3.9135544300079346, "lr": 5.544667225427803e-05, "elapsed_sec": 76231.28066420555, "step_time_sec": 8.230163563974202, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9188, "loss": 4.08242130279541, "lr": 5.534481741558339e-05, "elapsed_sec": 76239.51197075844, "step_time_sec": 8.23110714298673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9189, "loss": 4.001865386962891, "lr": 5.524304105196739e-05, "elapsed_sec": 76247.74240756035, "step_time_sec": 8.230306652956642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9190, "loss": 4.039455890655518, "lr": 5.5141343324277264e-05, "elapsed_sec": 76255.9729449749, "step_time_sec": 8.230384666007012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9191, "loss": 3.884681463241577, "lr": 5.503972439323634e-05, "elapsed_sec": 76264.20356965065, "step_time_sec": 8.230413644982036, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9192, "loss": 3.9074790477752686, "lr": 5.4938184419443084e-05, "elapsed_sec": 76272.4338388443, "step_time_sec": 8.230088279990014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9193, "loss": 3.9407575130462646, "lr": 5.483672356337143e-05, "elapsed_sec": 76280.66475605965, "step_time_sec": 8.230777654971462, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9194, "loss": 4.227875232696533, "lr": 5.473534198537015e-05, "elapsed_sec": 76288.89365339279, "step_time_sec": 8.228691716969479, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9195, "loss": 4.149547100067139, "lr": 5.463403984566272e-05, "elapsed_sec": 76297.12412405014, "step_time_sec": 8.230323572992347, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9196, "loss": 4.1118245124816895, "lr": 5.453281730434711e-05, "elapsed_sec": 76305.35497021675, "step_time_sec": 8.23074046295369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9197, "loss": 3.9720346927642822, "lr": 5.443167452139549e-05, "elapsed_sec": 76313.58466911316, "step_time_sec": 8.229519930959214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9198, "loss": 3.983353853225708, "lr": 5.433061165665398e-05, "elapsed_sec": 76321.81410932541, "step_time_sec": 8.229206255986355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9199, "loss": 4.022625923156738, "lr": 5.422962886984235e-05, "elapsed_sec": 76330.04478812218, "step_time_sec": 8.230487843975425, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9200, "loss": 3.9871628284454346, "lr": 5.412872632055386e-05, "elapsed_sec": 76338.2765352726, "step_time_sec": 8.231597668025643, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9201, "loss": 3.991136074066162, "lr": 5.4027904168255034e-05, "elapsed_sec": 76346.5070142746, "step_time_sec": 8.23028964997502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9202, "loss": 3.9654433727264404, "lr": 5.392716257228514e-05, "elapsed_sec": 76354.73814439774, "step_time_sec": 8.230979002953973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9203, "loss": 3.951000213623047, "lr": 5.3826501691856376e-05, "elapsed_sec": 76362.96939849854, "step_time_sec": 8.23107166500995, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9204, "loss": 3.9789156913757324, "lr": 5.372592168605311e-05, "elapsed_sec": 76371.20011425018, "step_time_sec": 8.230574326007627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9205, "loss": 3.887219190597534, "lr": 5.362542271383217e-05, "elapsed_sec": 76379.43006324768, "step_time_sec": 8.229791794961784, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9206, "loss": 3.841231107711792, "lr": 5.3525004934022124e-05, "elapsed_sec": 76387.6580016613, "step_time_sec": 8.227752584032714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9207, "loss": 4.176998615264893, "lr": 5.3424668505323305e-05, "elapsed_sec": 76395.8889245987, "step_time_sec": 8.230743624968454, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9208, "loss": 3.828232765197754, "lr": 5.332441358630747e-05, "elapsed_sec": 76404.11954712868, "step_time_sec": 8.230461097962689, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9209, "loss": 3.879188060760498, "lr": 5.322424033541751e-05, "elapsed_sec": 76412.35038542747, "step_time_sec": 8.230688865005504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9210, "loss": 3.9737963676452637, "lr": 5.312414891096734e-05, "elapsed_sec": 76420.58151912689, "step_time_sec": 8.231012391974218, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9211, "loss": 4.007088661193848, "lr": 5.302413947114147e-05, "elapsed_sec": 76428.81249713898, "step_time_sec": 8.230793291993905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9212, "loss": 3.9487195014953613, "lr": 5.292421217399485e-05, "elapsed_sec": 76437.04347109795, "step_time_sec": 8.230806311010383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9213, "loss": 3.9351696968078613, "lr": 5.282436717745273e-05, "elapsed_sec": 76445.27316379547, "step_time_sec": 8.229551828000695, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9214, "loss": 3.8786165714263916, "lr": 5.272460463931008e-05, "elapsed_sec": 76453.50383138657, "step_time_sec": 8.230494692979846, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9215, "loss": 4.027437210083008, "lr": 5.262492471723179e-05, "elapsed_sec": 76461.73225998878, "step_time_sec": 8.228223592974246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9216, "loss": 3.9708101749420166, "lr": 5.252532756875195e-05, "elapsed_sec": 76469.96030330658, "step_time_sec": 8.227873968018685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9217, "loss": 3.9893455505371094, "lr": 5.2425813351274034e-05, "elapsed_sec": 76478.18911790848, "step_time_sec": 8.228699986997526, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9218, "loss": 4.0444769859313965, "lr": 5.2326382222070326e-05, "elapsed_sec": 76486.41906094551, "step_time_sec": 8.229713422013447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9219, "loss": 4.071074485778809, "lr": 5.2227034338281865e-05, "elapsed_sec": 76494.65014362335, "step_time_sec": 8.230925371986814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9220, "loss": 3.9375052452087402, "lr": 5.212776985691809e-05, "elapsed_sec": 76502.88065814972, "step_time_sec": 8.230346532014664, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9221, "loss": 3.936163902282715, "lr": 5.202858893485663e-05, "elapsed_sec": 76511.11034440994, "step_time_sec": 8.22952782199718, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9222, "loss": 4.222146034240723, "lr": 5.1929491728843086e-05, "elapsed_sec": 76519.34174180031, "step_time_sec": 8.23121977300616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9223, "loss": 3.982567548751831, "lr": 5.1830478395490746e-05, "elapsed_sec": 76527.57217097282, "step_time_sec": 8.230314256972633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9224, "loss": 3.9270970821380615, "lr": 5.173154909128028e-05, "elapsed_sec": 76535.8030204773, "step_time_sec": 8.230635006038938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9225, "loss": 3.9135217666625977, "lr": 5.163270397255973e-05, "elapsed_sec": 76544.03298592567, "step_time_sec": 8.229790915967897, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9226, "loss": 4.010809421539307, "lr": 5.1533943195543847e-05, "elapsed_sec": 76552.26447534561, "step_time_sec": 8.231318341975566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9227, "loss": 4.055014610290527, "lr": 5.1435266916314346e-05, "elapsed_sec": 76560.49530768394, "step_time_sec": 8.230733434960712, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9228, "loss": 3.9308998584747314, "lr": 5.1336675290819144e-05, "elapsed_sec": 76568.72643685341, "step_time_sec": 8.230916524014901, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9229, "loss": 4.052155017852783, "lr": 5.12381684748726e-05, "elapsed_sec": 76576.95722603798, "step_time_sec": 8.230637278989889, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9230, "loss": 4.068618297576904, "lr": 5.113974662415491e-05, "elapsed_sec": 76585.18763589859, "step_time_sec": 8.230198737990577, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9231, "loss": 4.079677581787109, "lr": 5.104140989421201e-05, "elapsed_sec": 76593.41850471497, "step_time_sec": 8.230711210984737, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9232, "loss": 4.122682094573975, "lr": 5.094315844045535e-05, "elapsed_sec": 76601.64893627167, "step_time_sec": 8.230257349961903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9233, "loss": 3.960638999938965, "lr": 5.084499241816156e-05, "elapsed_sec": 76609.88122391701, "step_time_sec": 8.232138690014835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9234, "loss": 4.00953483581543, "lr": 5.074691198247229e-05, "elapsed_sec": 76618.11146688461, "step_time_sec": 8.23007511801552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9235, "loss": 3.9076504707336426, "lr": 5.0648917288393904e-05, "elapsed_sec": 76626.3425424099, "step_time_sec": 8.230902409995906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9236, "loss": 3.8008499145507812, "lr": 5.055100849079724e-05, "elapsed_sec": 76634.5730907917, "step_time_sec": 8.230400214029942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9237, "loss": 4.110025882720947, "lr": 5.0453185744417534e-05, "elapsed_sec": 76642.80357933044, "step_time_sec": 8.230334847001359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9238, "loss": 3.9115567207336426, "lr": 5.035544920385376e-05, "elapsed_sec": 76651.03480815887, "step_time_sec": 8.231062653008848, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9239, "loss": 3.9929358959198, "lr": 5.025779902356894e-05, "elapsed_sec": 76659.26417994499, "step_time_sec": 8.229177596978843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9240, "loss": 3.986232280731201, "lr": 5.0160235357889334e-05, "elapsed_sec": 76667.49497532845, "step_time_sec": 8.230632535007317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9241, "loss": 3.8846940994262695, "lr": 5.006275836100473e-05, "elapsed_sec": 76675.72424340248, "step_time_sec": 8.229141611955129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9242, "loss": 3.8334808349609375, "lr": 4.996536818696777e-05, "elapsed_sec": 76683.9524230957, "step_time_sec": 8.22799521900015, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9243, "loss": 4.132922649383545, "lr": 4.986806498969396e-05, "elapsed_sec": 76692.18111371994, "step_time_sec": 8.228484530001879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9244, "loss": 4.0701751708984375, "lr": 4.9770848922961325e-05, "elapsed_sec": 76700.41199564934, "step_time_sec": 8.230785235995427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9245, "loss": 3.884369134902954, "lr": 4.96737201404102e-05, "elapsed_sec": 76708.64307069778, "step_time_sec": 8.230908141995315, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9246, "loss": 4.064744472503662, "lr": 4.9576678795542935e-05, "elapsed_sec": 76716.87299442291, "step_time_sec": 8.229714374989271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9247, "loss": 3.856666326522827, "lr": 4.947972504172377e-05, "elapsed_sec": 76725.10302233696, "step_time_sec": 8.22983839403605, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9248, "loss": 4.034632682800293, "lr": 4.93828590321784e-05, "elapsed_sec": 76733.33431649208, "step_time_sec": 8.231115486007184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9249, "loss": 3.8155438899993896, "lr": 4.928608091999405e-05, "elapsed_sec": 76741.56451702118, "step_time_sec": 8.230024549004156, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9250, "loss": 3.8976869583129883, "lr": 4.918939085811875e-05, "elapsed_sec": 76749.79524278641, "step_time_sec": 8.230569589999504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9251, "loss": 3.939523220062256, "lr": 4.909278899936165e-05, "elapsed_sec": 76758.02528214455, "step_time_sec": 8.229880077997223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9252, "loss": 3.9030725955963135, "lr": 4.899627549639233e-05, "elapsed_sec": 76766.25602316856, "step_time_sec": 8.230552324966993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9253, "loss": 4.032766819000244, "lr": 4.8899850501740785e-05, "elapsed_sec": 76774.48691654205, "step_time_sec": 8.230747107008938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9254, "loss": 4.057214260101318, "lr": 4.880351416779714e-05, "elapsed_sec": 76782.71705579758, "step_time_sec": 8.229988760023843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9255, "loss": 3.789537191390991, "lr": 4.870726664681138e-05, "elapsed_sec": 76790.9481203556, "step_time_sec": 8.230892791994847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9256, "loss": 3.907358407974243, "lr": 4.8611108090893155e-05, "elapsed_sec": 76799.17817020416, "step_time_sec": 8.229904669977259, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9257, "loss": 4.036969184875488, "lr": 4.8515038652011495e-05, "elapsed_sec": 76807.40908169746, "step_time_sec": 8.230684323993046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9258, "loss": 3.978188991546631, "lr": 4.8419058481994595e-05, "elapsed_sec": 76815.64017343521, "step_time_sec": 8.23099720699247, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9259, "loss": 4.092867374420166, "lr": 4.832316773252954e-05, "elapsed_sec": 76823.8712720871, "step_time_sec": 8.230875471024774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9260, "loss": 4.005883693695068, "lr": 4.822736655516218e-05, "elapsed_sec": 76832.10128569603, "step_time_sec": 8.229825815011282, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9261, "loss": 4.137191295623779, "lr": 4.813165510129673e-05, "elapsed_sec": 76840.33230018616, "step_time_sec": 8.230852871958632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9262, "loss": 3.9715018272399902, "lr": 4.803603352219561e-05, "elapsed_sec": 76848.56321263313, "step_time_sec": 8.230768950015772, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9263, "loss": 4.029177665710449, "lr": 4.794050196897923e-05, "elapsed_sec": 76856.79407286644, "step_time_sec": 8.230706798960455, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9264, "loss": 3.9964993000030518, "lr": 4.7845060592625705e-05, "elapsed_sec": 76865.02470731735, "step_time_sec": 8.23046416300349, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9265, "loss": 4.082147598266602, "lr": 4.7749709543970657e-05, "elapsed_sec": 76873.25431203842, "step_time_sec": 8.229419577983208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9266, "loss": 4.057180881500244, "lr": 4.7654448973706915e-05, "elapsed_sec": 76881.485912323, "step_time_sec": 8.231434542976785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9267, "loss": 3.8630595207214355, "lr": 4.7559279032384346e-05, "elapsed_sec": 76889.71617913246, "step_time_sec": 8.230122567969374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9268, "loss": 3.9742114543914795, "lr": 4.7464199870409575e-05, "elapsed_sec": 76897.9466843605, "step_time_sec": 8.230337687011342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9269, "loss": 3.911381483078003, "lr": 4.736921163804574e-05, "elapsed_sec": 76906.1778204441, "step_time_sec": 8.23096615402028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9270, "loss": 4.054429531097412, "lr": 4.7274314485412357e-05, "elapsed_sec": 76914.40968298912, "step_time_sec": 8.231662674981635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9271, "loss": 4.023047924041748, "lr": 4.717950856248489e-05, "elapsed_sec": 76922.64098548889, "step_time_sec": 8.231210451980587, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9272, "loss": 4.002179145812988, "lr": 4.708479401909471e-05, "elapsed_sec": 76930.8717122078, "step_time_sec": 8.230503208993468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9273, "loss": 4.109551906585693, "lr": 4.6990171004928705e-05, "elapsed_sec": 76939.10290622711, "step_time_sec": 8.231015984958503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9274, "loss": 3.9736013412475586, "lr": 4.689563966952915e-05, "elapsed_sec": 76947.33392596245, "step_time_sec": 8.230817269010004, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9275, "loss": 3.9713923931121826, "lr": 4.6801200162293425e-05, "elapsed_sec": 76955.56482553482, "step_time_sec": 8.230761607002933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9276, "loss": 3.890641927719116, "lr": 4.670685263247376e-05, "elapsed_sec": 76963.79618000984, "step_time_sec": 8.23117420997005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9277, "loss": 3.8510284423828125, "lr": 4.661259722917704e-05, "elapsed_sec": 76972.02732133865, "step_time_sec": 8.23100972501561, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9278, "loss": 3.9888274669647217, "lr": 4.6518434101364634e-05, "elapsed_sec": 76980.25858092308, "step_time_sec": 8.231084422965068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9279, "loss": 4.039721965789795, "lr": 4.64243633978519e-05, "elapsed_sec": 76988.48866701126, "step_time_sec": 8.229932454007212, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9280, "loss": 3.9438633918762207, "lr": 4.6330385267308316e-05, "elapsed_sec": 76996.72017264366, "step_time_sec": 8.231369020009879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9281, "loss": 3.896148920059204, "lr": 4.623649985825689e-05, "elapsed_sec": 77004.95016717911, "step_time_sec": 8.229820493026637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9282, "loss": 3.9858078956604004, "lr": 4.6142707319074235e-05, "elapsed_sec": 77013.18072533607, "step_time_sec": 8.230458144971635, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9283, "loss": 4.002674102783203, "lr": 4.604900779799012e-05, "elapsed_sec": 77021.41105341911, "step_time_sec": 8.230102311994415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9284, "loss": 4.051941394805908, "lr": 4.595540144308732e-05, "elapsed_sec": 77029.64231228828, "step_time_sec": 8.23113192699384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9285, "loss": 3.979912519454956, "lr": 4.5861888402301364e-05, "elapsed_sec": 77037.87358546257, "step_time_sec": 8.231102363031823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9286, "loss": 3.8349006175994873, "lr": 4.576846882342032e-05, "elapsed_sec": 77046.10337853432, "step_time_sec": 8.229566082009114, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9287, "loss": 4.007108211517334, "lr": 4.567514285408454e-05, "elapsed_sec": 77054.33388328552, "step_time_sec": 8.230418680002913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9288, "loss": 4.001216888427734, "lr": 4.558191064178645e-05, "elapsed_sec": 77062.56464242935, "step_time_sec": 8.230524863000028, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9289, "loss": 3.950698137283325, "lr": 4.548877233387024e-05, "elapsed_sec": 77070.79586267471, "step_time_sec": 8.231127104023471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9290, "loss": 3.8046493530273438, "lr": 4.539572807753185e-05, "elapsed_sec": 77079.02671670914, "step_time_sec": 8.230676235980354, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9291, "loss": 4.005368709564209, "lr": 4.5302778019818344e-05, "elapsed_sec": 77087.25692725182, "step_time_sec": 8.230008515005466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9292, "loss": 3.9567110538482666, "lr": 4.5209922307628186e-05, "elapsed_sec": 77095.48648095131, "step_time_sec": 8.229413436027244, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9293, "loss": 3.9523720741271973, "lr": 4.511716108771045e-05, "elapsed_sec": 77103.71568083763, "step_time_sec": 8.229033724986948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9294, "loss": 4.013480186462402, "lr": 4.5024494506665117e-05, "elapsed_sec": 77111.94693279266, "step_time_sec": 8.23113505798392, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9295, "loss": 3.9227957725524902, "lr": 4.493192271094249e-05, "elapsed_sec": 77120.17784047127, "step_time_sec": 8.230773894989397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9296, "loss": 3.9601831436157227, "lr": 4.483944584684307e-05, "elapsed_sec": 77128.40796041489, "step_time_sec": 8.229913601011503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9297, "loss": 4.05873966217041, "lr": 4.474706406051736e-05, "elapsed_sec": 77136.6391775608, "step_time_sec": 8.231060886988416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9298, "loss": 3.7660751342773438, "lr": 4.4654777497965576e-05, "elapsed_sec": 77144.8702673912, "step_time_sec": 8.230970875010826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9299, "loss": 3.8937511444091797, "lr": 4.456258630503745e-05, "elapsed_sec": 77153.10197067261, "step_time_sec": 8.231523405993357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9300, "loss": 4.046032428741455, "lr": 4.447049062743201e-05, "elapsed_sec": 77161.33229327202, "step_time_sec": 8.230153708020225, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9301, "loss": 4.041930198669434, "lr": 4.437849061069727e-05, "elapsed_sec": 77169.56206274033, "step_time_sec": 8.229600751015823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9302, "loss": 3.731072187423706, "lr": 4.4286586400230215e-05, "elapsed_sec": 77177.79263782501, "step_time_sec": 8.23038511001505, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9303, "loss": 4.136142730712891, "lr": 4.419477814127617e-05, "elapsed_sec": 77186.02359294891, "step_time_sec": 8.230841558019165, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9304, "loss": 3.9487226009368896, "lr": 4.410306597892911e-05, "elapsed_sec": 77194.25306415558, "step_time_sec": 8.229317059973255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9305, "loss": 3.930269479751587, "lr": 4.401145005813083e-05, "elapsed_sec": 77202.4833278656, "step_time_sec": 8.230030781996902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9306, "loss": 3.8869614601135254, "lr": 4.391993052367131e-05, "elapsed_sec": 77210.71466875076, "step_time_sec": 8.231207067030482, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9307, "loss": 3.9065446853637695, "lr": 4.382850752018802e-05, "elapsed_sec": 77218.94561314583, "step_time_sec": 8.230752405012026, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9308, "loss": 3.997391700744629, "lr": 4.3737181192165924e-05, "elapsed_sec": 77227.17570900917, "step_time_sec": 8.229940660996363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9309, "loss": 4.07125997543335, "lr": 4.364595168393723e-05, "elapsed_sec": 77235.40606117249, "step_time_sec": 8.230251541011967, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9310, "loss": 3.8454136848449707, "lr": 4.355481913968107e-05, "elapsed_sec": 77243.63698005676, "step_time_sec": 8.230673207028303, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9311, "loss": 3.9412388801574707, "lr": 4.346378370342339e-05, "elapsed_sec": 77251.86807155609, "step_time_sec": 8.230940040026326, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9312, "loss": 4.023496627807617, "lr": 4.337284551903664e-05, "elapsed_sec": 77260.09854960442, "step_time_sec": 8.230317431036383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9313, "loss": 3.9661636352539062, "lr": 4.328200473023953e-05, "elapsed_sec": 77268.3290669918, "step_time_sec": 8.230417871964164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9314, "loss": 3.8759067058563232, "lr": 4.319126148059703e-05, "elapsed_sec": 77276.55962109566, "step_time_sec": 8.230321042996366, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9315, "loss": 3.8295724391937256, "lr": 4.3100615913519646e-05, "elapsed_sec": 77284.79062747955, "step_time_sec": 8.230850446037948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9316, "loss": 4.086449146270752, "lr": 4.3010068172263844e-05, "elapsed_sec": 77293.02098870277, "step_time_sec": 8.230213648988865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9317, "loss": 3.9316444396972656, "lr": 4.291961839993119e-05, "elapsed_sec": 77301.24995684624, "step_time_sec": 8.228828127030283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9318, "loss": 3.825847625732422, "lr": 4.282926673946865e-05, "elapsed_sec": 77309.478774786, "step_time_sec": 8.228630773024634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9319, "loss": 3.845771074295044, "lr": 4.2739013333668e-05, "elapsed_sec": 77317.70742154121, "step_time_sec": 8.228446446999442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9320, "loss": 3.9418318271636963, "lr": 4.264885832516578e-05, "elapsed_sec": 77325.93940782547, "step_time_sec": 8.231891500996426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9321, "loss": 3.952950954437256, "lr": 4.255880185644302e-05, "elapsed_sec": 77334.17007732391, "step_time_sec": 8.230488027969841, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9322, "loss": 4.143133163452148, "lr": 4.2468844069825e-05, "elapsed_sec": 77342.4011502266, "step_time_sec": 8.230881621013395, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9323, "loss": 3.923511266708374, "lr": 4.2378985107481076e-05, "elapsed_sec": 77350.63190889359, "step_time_sec": 8.230651281017344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9324, "loss": 3.9353678226470947, "lr": 4.228922511142439e-05, "elapsed_sec": 77358.86267137527, "step_time_sec": 8.230575026012957, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9325, "loss": 3.919606924057007, "lr": 4.219956422351166e-05, "elapsed_sec": 77367.09358406067, "step_time_sec": 8.230766784981824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9326, "loss": 3.8686537742614746, "lr": 4.211000258544311e-05, "elapsed_sec": 77375.32370471954, "step_time_sec": 8.229955677001271, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9327, "loss": 4.078649997711182, "lr": 4.202054033876186e-05, "elapsed_sec": 77383.55485224724, "step_time_sec": 8.230967332026921, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9328, "loss": 4.0007643699646, "lr": 4.1931177624854226e-05, "elapsed_sec": 77391.78564238548, "step_time_sec": 8.230638428998645, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9329, "loss": 3.94183087348938, "lr": 4.1841914584948975e-05, "elapsed_sec": 77400.01672196388, "step_time_sec": 8.230931915983092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9330, "loss": 3.8490068912506104, "lr": 4.175275136011755e-05, "elapsed_sec": 77408.24723291397, "step_time_sec": 8.230359765992034, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9331, "loss": 3.9385452270507812, "lr": 4.166368809127354e-05, "elapsed_sec": 77416.54411935806, "step_time_sec": 8.23834297095891, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9332, "loss": 3.899122714996338, "lr": 4.157472491917256e-05, "elapsed_sec": 77424.77470088005, "step_time_sec": 8.230408416013233, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9333, "loss": 3.9257278442382812, "lr": 4.14858619844121e-05, "elapsed_sec": 77433.00579309464, "step_time_sec": 8.230962200032081, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9334, "loss": 4.026243209838867, "lr": 4.139709942743115e-05, "elapsed_sec": 77441.23708724976, "step_time_sec": 8.231163471005857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9335, "loss": 3.8998336791992188, "lr": 4.130843738851013e-05, "elapsed_sec": 77449.46848011017, "step_time_sec": 8.23122317000525, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9336, "loss": 3.837921142578125, "lr": 4.1219876007770574e-05, "elapsed_sec": 77457.69926190376, "step_time_sec": 8.230576041038148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9337, "loss": 3.9355838298797607, "lr": 4.1131415425174895e-05, "elapsed_sec": 77465.93020057678, "step_time_sec": 8.230781616002787, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9338, "loss": 3.9729959964752197, "lr": 4.104305578052636e-05, "elapsed_sec": 77474.16124629974, "step_time_sec": 8.230973325029481, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9339, "loss": 3.978421688079834, "lr": 4.095479721346846e-05, "elapsed_sec": 77482.3920454979, "step_time_sec": 8.23057511303341, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9340, "loss": 3.899319887161255, "lr": 4.086663986348517e-05, "elapsed_sec": 77490.62267231941, "step_time_sec": 8.230478191981092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9341, "loss": 3.953289031982422, "lr": 4.0778583869900414e-05, "elapsed_sec": 77498.85287714005, "step_time_sec": 8.230031888000667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9342, "loss": 4.103731632232666, "lr": 4.069062937187791e-05, "elapsed_sec": 77507.0839419365, "step_time_sec": 8.230895142012741, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9343, "loss": 3.9302639961242676, "lr": 4.060277650842101e-05, "elapsed_sec": 77515.31480240822, "step_time_sec": 8.230702407017816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9344, "loss": 4.140739917755127, "lr": 4.0515025418372425e-05, "elapsed_sec": 77523.54566812515, "step_time_sec": 8.230758015997708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9345, "loss": 3.9509778022766113, "lr": 4.0427376240414005e-05, "elapsed_sec": 77531.7768881321, "step_time_sec": 8.231019621016458, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9346, "loss": 3.799126148223877, "lr": 4.03398291130666e-05, "elapsed_sec": 77540.00749874115, "step_time_sec": 8.230441463994794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9347, "loss": 4.030576229095459, "lr": 4.0252384174689714e-05, "elapsed_sec": 77548.23893070221, "step_time_sec": 8.231285052956082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9348, "loss": 3.9212162494659424, "lr": 4.0165041563481375e-05, "elapsed_sec": 77556.46741652489, "step_time_sec": 8.228346068004612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9349, "loss": 3.724917411804199, "lr": 4.007780141747788e-05, "elapsed_sec": 77564.69683098793, "step_time_sec": 8.229279863997363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9350, "loss": 4.081788063049316, "lr": 3.999066387455365e-05, "elapsed_sec": 77572.92635440826, "step_time_sec": 8.229324751009699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9351, "loss": 3.796027898788452, "lr": 3.990362907242089e-05, "elapsed_sec": 77581.15609145164, "step_time_sec": 8.229616013006307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9352, "loss": 4.030420780181885, "lr": 3.981669714862945e-05, "elapsed_sec": 77589.3871049881, "step_time_sec": 8.230822830984835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9353, "loss": 3.954913377761841, "lr": 3.972986824056657e-05, "elapsed_sec": 77597.61499595642, "step_time_sec": 8.2277412309777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9354, "loss": 3.9254672527313232, "lr": 3.964314248545674e-05, "elapsed_sec": 77605.84511113167, "step_time_sec": 8.229911031958181, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9355, "loss": 4.094823360443115, "lr": 3.955652002036136e-05, "elapsed_sec": 77614.07601642609, "step_time_sec": 8.230728077003732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9356, "loss": 3.8776471614837646, "lr": 3.9470000982178646e-05, "elapsed_sec": 77622.30656790733, "step_time_sec": 8.230405370006338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9357, "loss": 4.148153781890869, "lr": 3.938358550764332e-05, "elapsed_sec": 77630.53790736198, "step_time_sec": 8.231190513004549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9358, "loss": 3.8477652072906494, "lr": 3.929727373332641e-05, "elapsed_sec": 77638.76590776443, "step_time_sec": 8.227818616025615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9359, "loss": 3.996631145477295, "lr": 3.921106579563518e-05, "elapsed_sec": 77646.99712610245, "step_time_sec": 8.231082891987171, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9360, "loss": 3.803065061569214, "lr": 3.912496183081265e-05, "elapsed_sec": 77655.22734427452, "step_time_sec": 8.230127410963178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9361, "loss": 4.029995918273926, "lr": 3.903896197493759e-05, "elapsed_sec": 77663.45738554001, "step_time_sec": 8.229809485957958, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9362, "loss": 3.7869644165039062, "lr": 3.895306636392421e-05, "elapsed_sec": 77671.68769574165, "step_time_sec": 8.230180770042352, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9363, "loss": 4.010634899139404, "lr": 3.886727513352198e-05, "elapsed_sec": 77679.91801810265, "step_time_sec": 8.230176343990024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9364, "loss": 3.780172109603882, "lr": 3.8781588419315416e-05, "elapsed_sec": 77688.14509677887, "step_time_sec": 8.226949379022699, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9365, "loss": 3.8880603313446045, "lr": 3.8696006356723846e-05, "elapsed_sec": 77696.37586522102, "step_time_sec": 8.230555537971668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9366, "loss": 3.823983669281006, "lr": 3.861052908100121e-05, "elapsed_sec": 77704.60432600975, "step_time_sec": 8.228351735975593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9367, "loss": 3.9386463165283203, "lr": 3.852515672723585e-05, "elapsed_sec": 77712.83389115334, "step_time_sec": 8.229317797988188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9368, "loss": 4.034359931945801, "lr": 3.843988943035025e-05, "elapsed_sec": 77721.06194496155, "step_time_sec": 8.227928453008644, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9369, "loss": 3.9120843410491943, "lr": 3.835472732510097e-05, "elapsed_sec": 77729.29306387901, "step_time_sec": 8.231008151953574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9370, "loss": 3.9189422130584717, "lr": 3.8269670546078145e-05, "elapsed_sec": 77737.5237417221, "step_time_sec": 8.230412265984342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9371, "loss": 3.887817144393921, "lr": 3.818471922770563e-05, "elapsed_sec": 77745.75490379333, "step_time_sec": 8.231030949042179, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9372, "loss": 3.9689691066741943, "lr": 3.8099873504240527e-05, "elapsed_sec": 77753.9858417511, "step_time_sec": 8.230766141961794, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9373, "loss": 4.1023759841918945, "lr": 3.8015133509773056e-05, "elapsed_sec": 77762.21550917625, "step_time_sec": 8.22949942498235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9374, "loss": 3.950731039047241, "lr": 3.793049937822634e-05, "elapsed_sec": 77770.44589972496, "step_time_sec": 8.230249940010253, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9375, "loss": 3.885064125061035, "lr": 3.784597124335624e-05, "elapsed_sec": 77778.6764087677, "step_time_sec": 8.230348772951402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9376, "loss": 3.852783203125, "lr": 3.776154923875104e-05, "elapsed_sec": 77786.90710759163, "step_time_sec": 8.230536504997872, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9377, "loss": 3.8603219985961914, "lr": 3.767723349783132e-05, "elapsed_sec": 77795.13699412346, "step_time_sec": 8.229788967990316, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9378, "loss": 3.9421064853668213, "lr": 3.759302415384972e-05, "elapsed_sec": 77803.36752080917, "step_time_sec": 8.230280360032339, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9379, "loss": 4.020236492156982, "lr": 3.750892133989081e-05, "elapsed_sec": 77811.59862303734, "step_time_sec": 8.23100109101506, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9380, "loss": 4.122254848480225, "lr": 3.742492518887059e-05, "elapsed_sec": 77819.82948493958, "step_time_sec": 8.230684496986214, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9381, "loss": 3.967653751373291, "lr": 3.734103583353677e-05, "elapsed_sec": 77828.06040072441, "step_time_sec": 8.230739680002443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9382, "loss": 3.8060009479522705, "lr": 3.7257253406468e-05, "elapsed_sec": 77836.2904791832, "step_time_sec": 8.22996291902382, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9383, "loss": 3.9831998348236084, "lr": 3.717357804007418e-05, "elapsed_sec": 77844.5216114521, "step_time_sec": 8.230932560982183, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9384, "loss": 3.944411516189575, "lr": 3.709000986659586e-05, "elapsed_sec": 77852.75237369537, "step_time_sec": 8.230583583004773, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9385, "loss": 3.8353240489959717, "lr": 3.700654901810423e-05, "elapsed_sec": 77860.98371958733, "step_time_sec": 8.231224443006795, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9386, "loss": 3.847775459289551, "lr": 3.6923195626500864e-05, "elapsed_sec": 77869.21500635147, "step_time_sec": 8.231112035980914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9387, "loss": 3.7249362468719482, "lr": 3.683994982351751e-05, "elapsed_sec": 77877.44522619247, "step_time_sec": 8.230034444946796, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9388, "loss": 3.929161310195923, "lr": 3.675681174071589e-05, "elapsed_sec": 77885.67615771294, "step_time_sec": 8.230805386963766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9389, "loss": 3.968472719192505, "lr": 3.667378150948747e-05, "elapsed_sec": 77893.90600943565, "step_time_sec": 8.229672946967185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9390, "loss": 3.829111099243164, "lr": 3.659085926105325e-05, "elapsed_sec": 77902.13748145103, "step_time_sec": 8.231354342016857, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9391, "loss": 3.834059715270996, "lr": 3.6508045126463666e-05, "elapsed_sec": 77910.36762833595, "step_time_sec": 8.229948776948731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9392, "loss": 4.058673858642578, "lr": 3.642533923659812e-05, "elapsed_sec": 77918.59838414192, "step_time_sec": 8.2305871679564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9393, "loss": 3.8624532222747803, "lr": 3.634274172216518e-05, "elapsed_sec": 77926.82907366753, "step_time_sec": 8.2305596289807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9394, "loss": 3.786017894744873, "lr": 3.626025271370186e-05, "elapsed_sec": 77935.06116294861, "step_time_sec": 8.231956303992774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9395, "loss": 4.004737854003906, "lr": 3.617787234157394e-05, "elapsed_sec": 77943.29209685326, "step_time_sec": 8.230779286008328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9396, "loss": 3.9183082580566406, "lr": 3.609560073597537e-05, "elapsed_sec": 77951.52187037468, "step_time_sec": 8.22956670995336, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9397, "loss": 3.9572999477386475, "lr": 3.6013438026928235e-05, "elapsed_sec": 77959.75305581093, "step_time_sec": 8.231011038005818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9398, "loss": 3.905196189880371, "lr": 3.593138434428255e-05, "elapsed_sec": 77967.98211169243, "step_time_sec": 8.228895336971618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9399, "loss": 3.940176248550415, "lr": 3.584943981771599e-05, "elapsed_sec": 77976.2125313282, "step_time_sec": 8.23023662797641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9400, "loss": 3.834146738052368, "lr": 3.576760457673372e-05, "elapsed_sec": 77984.44195747375, "step_time_sec": 8.229251954006031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9401, "loss": 3.935765027999878, "lr": 3.5685878750668214e-05, "elapsed_sec": 77992.67210936546, "step_time_sec": 8.230015637003817, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9402, "loss": 3.882781505584717, "lr": 3.560426246867897e-05, "elapsed_sec": 78000.90279483795, "step_time_sec": 8.230477574979886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9403, "loss": 3.850321054458618, "lr": 3.552275585975252e-05, "elapsed_sec": 78009.13282108307, "step_time_sec": 8.22985541401431, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9404, "loss": 3.840848922729492, "lr": 3.544135905270181e-05, "elapsed_sec": 78017.36411046982, "step_time_sec": 8.231113334011752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9405, "loss": 3.944812774658203, "lr": 3.536007217616652e-05, "elapsed_sec": 78025.59403276443, "step_time_sec": 8.22974440996768, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9406, "loss": 3.8670692443847656, "lr": 3.527889535861239e-05, "elapsed_sec": 78033.8247950077, "step_time_sec": 8.230598639987875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9407, "loss": 3.957017421722412, "lr": 3.5197828728331356e-05, "elapsed_sec": 78042.05609250069, "step_time_sec": 8.231122874014545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9408, "loss": 3.8138108253479004, "lr": 3.5116872413441145e-05, "elapsed_sec": 78050.28573346138, "step_time_sec": 8.229526712966617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9409, "loss": 3.9699525833129883, "lr": 3.503602654188519e-05, "elapsed_sec": 78058.51518034935, "step_time_sec": 8.229307186964434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9410, "loss": 3.8873965740203857, "lr": 3.4955291241432324e-05, "elapsed_sec": 78066.74335193634, "step_time_sec": 8.227976439055055, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9411, "loss": 3.9498450756073, "lr": 3.487466663967668e-05, "elapsed_sec": 78074.97194933891, "step_time_sec": 8.228457571996842, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9412, "loss": 3.9335381984710693, "lr": 3.47941528640374e-05, "elapsed_sec": 78083.202085495, "step_time_sec": 8.229925941035617, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9413, "loss": 3.871391773223877, "lr": 3.4713750041758524e-05, "elapsed_sec": 78091.43219256401, "step_time_sec": 8.230007458012551, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9414, "loss": 3.9846408367156982, "lr": 3.463345829990868e-05, "elapsed_sec": 78099.66323232651, "step_time_sec": 8.23079446196789, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9415, "loss": 3.924621820449829, "lr": 3.455327776538106e-05, "elapsed_sec": 78107.89196825027, "step_time_sec": 8.22858916799305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9416, "loss": 3.8543145656585693, "lr": 3.447320856489293e-05, "elapsed_sec": 78116.12305402756, "step_time_sec": 8.230899841990322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9417, "loss": 3.6893961429595947, "lr": 3.439325082498579e-05, "elapsed_sec": 78124.35433030128, "step_time_sec": 8.231144352001138, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9418, "loss": 3.891227960586548, "lr": 3.431340467202481e-05, "elapsed_sec": 78132.58497714996, "step_time_sec": 8.230455612996593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9419, "loss": 4.129190444946289, "lr": 3.423367023219896e-05, "elapsed_sec": 78140.81525087357, "step_time_sec": 8.230128241993953, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9420, "loss": 3.8714892864227295, "lr": 3.415404763152058e-05, "elapsed_sec": 78149.04517292976, "step_time_sec": 8.229763689043466, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9421, "loss": 3.922774076461792, "lr": 3.4074536995825254e-05, "elapsed_sec": 78157.27627468109, "step_time_sec": 8.230920313042589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9422, "loss": 4.073875427246094, "lr": 3.3995138450771656e-05, "elapsed_sec": 78165.50750780106, "step_time_sec": 8.231097985990345, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9423, "loss": 3.98241925239563, "lr": 3.391585212184127e-05, "elapsed_sec": 78173.73886466026, "step_time_sec": 8.231153447006363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9424, "loss": 4.0042219161987305, "lr": 3.3836678134338274e-05, "elapsed_sec": 78181.9694404602, "step_time_sec": 8.230413228971884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9425, "loss": 3.9433810710906982, "lr": 3.3757616613389284e-05, "elapsed_sec": 78190.20048904419, "step_time_sec": 8.230919844005257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9426, "loss": 3.864459753036499, "lr": 3.367866768394313e-05, "elapsed_sec": 78198.42865633965, "step_time_sec": 8.227942455967423, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9427, "loss": 3.7878222465515137, "lr": 3.359983147077083e-05, "elapsed_sec": 78206.65760397911, "step_time_sec": 8.228832433000207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9428, "loss": 3.9067556858062744, "lr": 3.352110809846508e-05, "elapsed_sec": 78214.88839626312, "step_time_sec": 8.230618013010826, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9429, "loss": 3.967158794403076, "lr": 3.344249769144042e-05, "elapsed_sec": 78223.11890268326, "step_time_sec": 8.230350156023633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9430, "loss": 3.9551427364349365, "lr": 3.336400037393274e-05, "elapsed_sec": 78231.35101151466, "step_time_sec": 8.231956855976023, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9431, "loss": 3.976804256439209, "lr": 3.3285616269999256e-05, "elapsed_sec": 78239.58206772804, "step_time_sec": 8.230850004008971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9432, "loss": 4.014685153961182, "lr": 3.320734550351827e-05, "elapsed_sec": 78247.81234002113, "step_time_sec": 8.230117582017556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9433, "loss": 3.828608751296997, "lr": 3.3129188198188906e-05, "elapsed_sec": 78256.04391717911, "step_time_sec": 8.231429347011726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9434, "loss": 4.1878814697265625, "lr": 3.305114447753103e-05, "elapsed_sec": 78264.27430963516, "step_time_sec": 8.230243008001707, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9435, "loss": 3.952697515487671, "lr": 3.2973214464885e-05, "elapsed_sec": 78272.505215168, "step_time_sec": 8.230725662026089, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9436, "loss": 3.9467999935150146, "lr": 3.2895398283411435e-05, "elapsed_sec": 78280.73465251923, "step_time_sec": 8.229284978006035, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9437, "loss": 3.949458122253418, "lr": 3.2817696056091076e-05, "elapsed_sec": 78288.96602082253, "step_time_sec": 8.231195525964722, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9438, "loss": 4.0142340660095215, "lr": 3.274010790572453e-05, "elapsed_sec": 78297.19657564163, "step_time_sec": 8.230404986999929, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9439, "loss": 3.7952587604522705, "lr": 3.266263395493223e-05, "elapsed_sec": 78305.4274251461, "step_time_sec": 8.230729133996647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9440, "loss": 4.112387180328369, "lr": 3.2585274326154004e-05, "elapsed_sec": 78313.65887594223, "step_time_sec": 8.231253004982136, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9441, "loss": 4.05242919921875, "lr": 3.2508029141649056e-05, "elapsed_sec": 78321.88951802254, "step_time_sec": 8.230539013980888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9442, "loss": 3.932960271835327, "lr": 3.243089852349572e-05, "elapsed_sec": 78330.12156438828, "step_time_sec": 8.231888898997568, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9443, "loss": 3.951808214187622, "lr": 3.235388259359128e-05, "elapsed_sec": 78338.35201764107, "step_time_sec": 8.230244258011226, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9444, "loss": 3.8281052112579346, "lr": 3.227698147365174e-05, "elapsed_sec": 78346.57974481583, "step_time_sec": 8.2275962549611, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9445, "loss": 4.084792137145996, "lr": 3.2200195285211684e-05, "elapsed_sec": 78354.80967950821, "step_time_sec": 8.229721035051625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9446, "loss": 3.779038906097412, "lr": 3.212352414962404e-05, "elapsed_sec": 78363.03791379929, "step_time_sec": 8.228092022996861, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9447, "loss": 3.8490214347839355, "lr": 3.204696818805992e-05, "elapsed_sec": 78371.26913785934, "step_time_sec": 8.231091727036983, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9448, "loss": 3.8590023517608643, "lr": 3.1970527521508376e-05, "elapsed_sec": 78379.49952077866, "step_time_sec": 8.230202006991021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9449, "loss": 3.847120523452759, "lr": 3.189420227077633e-05, "elapsed_sec": 78387.73037934303, "step_time_sec": 8.230784629005939, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9450, "loss": 3.8003079891204834, "lr": 3.181799255648824e-05, "elapsed_sec": 78395.96090269089, "step_time_sec": 8.230285461992025, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9451, "loss": 3.909497022628784, "lr": 3.1741898499085985e-05, "elapsed_sec": 78404.19210958481, "step_time_sec": 8.231119387957733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9452, "loss": 3.8948543071746826, "lr": 3.166592021882862e-05, "elapsed_sec": 78412.42239785194, "step_time_sec": 8.230056056985632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9453, "loss": 4.486835479736328, "lr": 3.15900578357923e-05, "elapsed_sec": 78420.65322780609, "step_time_sec": 8.230661328998394, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9454, "loss": 4.059921741485596, "lr": 3.151431146986996e-05, "elapsed_sec": 78428.88293790817, "step_time_sec": 8.229526821989566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9455, "loss": 3.9807991981506348, "lr": 3.1438681240771205e-05, "elapsed_sec": 78437.1137137413, "step_time_sec": 8.230598287947942, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9456, "loss": 3.9084930419921875, "lr": 3.136316726802209e-05, "elapsed_sec": 78445.34475445747, "step_time_sec": 8.23093570198398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9457, "loss": 3.9075686931610107, "lr": 3.128776967096491e-05, "elapsed_sec": 78453.57386493683, "step_time_sec": 8.228891869017389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9458, "loss": 4.028278827667236, "lr": 3.121248856875816e-05, "elapsed_sec": 78461.80111694336, "step_time_sec": 8.227126857964322, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9459, "loss": 3.992558479309082, "lr": 3.1137324080376023e-05, "elapsed_sec": 78470.03146123886, "step_time_sec": 8.230158013000619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9460, "loss": 3.8810935020446777, "lr": 3.1062276324608575e-05, "elapsed_sec": 78478.26164865494, "step_time_sec": 8.230090447003022, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9461, "loss": 3.9161038398742676, "lr": 3.098734542006131e-05, "elapsed_sec": 78486.49329519272, "step_time_sec": 8.231516769039445, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9462, "loss": 3.985088586807251, "lr": 3.0912531485155065e-05, "elapsed_sec": 78494.72202396393, "step_time_sec": 8.228469670983031, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9463, "loss": 4.0569376945495605, "lr": 3.083783463812584e-05, "elapsed_sec": 78502.94985675812, "step_time_sec": 8.227639885037206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9464, "loss": 3.898383378982544, "lr": 3.076325499702456e-05, "elapsed_sec": 78511.17956638336, "step_time_sec": 8.229571550968103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9465, "loss": 3.9327991008758545, "lr": 3.068879267971694e-05, "elapsed_sec": 78519.40941596031, "step_time_sec": 8.229705471021589, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9466, "loss": 3.9405782222747803, "lr": 3.061444780388326e-05, "elapsed_sec": 78527.6407186985, "step_time_sec": 8.231165768986102, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9467, "loss": 3.989656686782837, "lr": 3.0540220487018186e-05, "elapsed_sec": 78535.87201237679, "step_time_sec": 8.231091452005785, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9468, "loss": 4.040224075317383, "lr": 3.0466110846430677e-05, "elapsed_sec": 78544.10306191444, "step_time_sec": 8.230908602010459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9469, "loss": 3.8671441078186035, "lr": 3.0392118999243568e-05, "elapsed_sec": 78552.33422899246, "step_time_sec": 8.231015840021428, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9470, "loss": 3.9488134384155273, "lr": 3.03182450623937e-05, "elapsed_sec": 78560.56441688538, "step_time_sec": 8.230065473006107, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9471, "loss": 3.9589028358459473, "lr": 3.0244489152631395e-05, "elapsed_sec": 78568.79350543022, "step_time_sec": 8.228874902997632, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9472, "loss": 4.021341323852539, "lr": 3.0170851386520608e-05, "elapsed_sec": 78577.02458763123, "step_time_sec": 8.23100886499742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9473, "loss": 3.969391345977783, "lr": 3.0097331880438462e-05, "elapsed_sec": 78585.25617647171, "step_time_sec": 8.231411695014685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9474, "loss": 3.8434884548187256, "lr": 3.0023930750575252e-05, "elapsed_sec": 78593.48764038086, "step_time_sec": 8.23125724302372, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9475, "loss": 3.9312171936035156, "lr": 2.9950648112934153e-05, "elapsed_sec": 78601.7186126709, "step_time_sec": 8.230823044024874, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9476, "loss": 3.899813652038574, "lr": 2.9877484083331084e-05, "elapsed_sec": 78609.94927549362, "step_time_sec": 8.230496807023883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9477, "loss": 3.9423017501831055, "lr": 2.9804438777394533e-05, "elapsed_sec": 78618.18013691902, "step_time_sec": 8.230705341964494, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9478, "loss": 4.0043487548828125, "lr": 2.9731512310565326e-05, "elapsed_sec": 78626.41135025024, "step_time_sec": 8.231083195016254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9479, "loss": 4.035171031951904, "lr": 2.9658704798096478e-05, "elapsed_sec": 78634.6424202919, "step_time_sec": 8.230922440008726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9480, "loss": 3.9776790142059326, "lr": 2.958601635505308e-05, "elapsed_sec": 78642.87403821945, "step_time_sec": 8.23141393403057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9481, "loss": 3.8859736919403076, "lr": 2.951344709631191e-05, "elapsed_sec": 78651.10488343239, "step_time_sec": 8.230710021976847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9482, "loss": 4.287928104400635, "lr": 2.9440997136561562e-05, "elapsed_sec": 78659.33622956276, "step_time_sec": 8.231169650040101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9483, "loss": 3.98823618888855, "lr": 2.9368666590301892e-05, "elapsed_sec": 78667.56704163551, "step_time_sec": 8.230659707973246, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9484, "loss": 3.9368152618408203, "lr": 2.9296455571844204e-05, "elapsed_sec": 78675.79779911041, "step_time_sec": 8.230625723022968, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9485, "loss": 4.114383697509766, "lr": 2.92243641953108e-05, "elapsed_sec": 78684.0292892456, "step_time_sec": 8.231292481999844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9486, "loss": 3.9033138751983643, "lr": 2.9152392574634965e-05, "elapsed_sec": 78692.25969934464, "step_time_sec": 8.230314688000362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9487, "loss": 3.968386650085449, "lr": 2.908054082356065e-05, "elapsed_sec": 78700.49040746689, "step_time_sec": 8.230573642998934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9488, "loss": 4.02667236328125, "lr": 2.9008809055642418e-05, "elapsed_sec": 78708.72126984596, "step_time_sec": 8.23065449699061, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9489, "loss": 3.9664688110351562, "lr": 2.893719738424521e-05, "elapsed_sec": 78716.95183825493, "step_time_sec": 8.230420611973386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9490, "loss": 3.9327826499938965, "lr": 2.8865705922544136e-05, "elapsed_sec": 78725.18258619308, "step_time_sec": 8.23062706302153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9491, "loss": 3.893310785293579, "lr": 2.8794334783524317e-05, "elapsed_sec": 78733.41343593597, "step_time_sec": 8.23061421199236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9492, "loss": 4.057712554931641, "lr": 2.872308407998082e-05, "elapsed_sec": 78741.64428210258, "step_time_sec": 8.230738404032309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9493, "loss": 3.9268009662628174, "lr": 2.8651953924518194e-05, "elapsed_sec": 78749.87447929382, "step_time_sec": 8.229990691994317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9494, "loss": 4.001918792724609, "lr": 2.8580944429550685e-05, "elapsed_sec": 78758.10586953163, "step_time_sec": 8.231234168983065, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9495, "loss": 4.0872321128845215, "lr": 2.8510055707301636e-05, "elapsed_sec": 78766.33588910103, "step_time_sec": 8.229894598014653, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9496, "loss": 3.9532833099365234, "lr": 2.8439287869803688e-05, "elapsed_sec": 78774.5678229332, "step_time_sec": 8.23176620103186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9497, "loss": 3.918994665145874, "lr": 2.8368641028898362e-05, "elapsed_sec": 78782.79837465286, "step_time_sec": 8.230403074994683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9498, "loss": 3.8920254707336426, "lr": 2.8298115296235957e-05, "elapsed_sec": 78791.0294957161, "step_time_sec": 8.23097198503092, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9499, "loss": 3.853797197341919, "lr": 2.8227710783275383e-05, "elapsed_sec": 78799.26061844826, "step_time_sec": 8.230949271994177, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9500, "loss": 3.934995412826538, "lr": 2.815742760128396e-05, "elapsed_sec": 78807.4916408062, "step_time_sec": 30.70950300002005, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9501, "loss": 4.099531173706055, "lr": 2.8087265861337283e-05, "elapsed_sec": 78838.21587133408, "step_time_sec": 8.245178921031766, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9502, "loss": 3.920382261276245, "lr": 2.8017225674319008e-05, "elapsed_sec": 78846.4319357872, "step_time_sec": 8.215819106961135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9503, "loss": 3.950387477874756, "lr": 2.7947307150920653e-05, "elapsed_sec": 78854.64866161346, "step_time_sec": 8.216624842956662, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9504, "loss": 3.924221992492676, "lr": 2.7877510401641562e-05, "elapsed_sec": 78862.8654615879, "step_time_sec": 8.216567506955471, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9505, "loss": 3.903296709060669, "lr": 2.780783553678848e-05, "elapsed_sec": 78871.0921459198, "step_time_sec": 8.226549279002938, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9506, "loss": 4.009622097015381, "lr": 2.7738282666475666e-05, "elapsed_sec": 78879.3222360611, "step_time_sec": 8.22991064604139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9507, "loss": 4.083584308624268, "lr": 2.766885190062444e-05, "elapsed_sec": 78887.55309128761, "step_time_sec": 8.23068651498761, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9508, "loss": 3.8446433544158936, "lr": 2.75995433489633e-05, "elapsed_sec": 78895.78469824791, "step_time_sec": 8.231516613974236, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9509, "loss": 4.018352031707764, "lr": 2.753035712102748e-05, "elapsed_sec": 78904.01413106918, "step_time_sec": 8.229203771974426, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9510, "loss": 4.0367608070373535, "lr": 2.746129332615895e-05, "elapsed_sec": 78912.24516272545, "step_time_sec": 8.230891741986852, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9511, "loss": 3.9688968658447266, "lr": 2.7392352073506157e-05, "elapsed_sec": 78920.47628831863, "step_time_sec": 8.230938285996672, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9512, "loss": 3.9368932247161865, "lr": 2.7323533472023903e-05, "elapsed_sec": 78928.70771479607, "step_time_sec": 8.231235979998019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9513, "loss": 3.957277774810791, "lr": 2.7254837630473148e-05, "elapsed_sec": 78936.93887329102, "step_time_sec": 8.230998616025317, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9514, "loss": 3.905150890350342, "lr": 2.718626465742083e-05, "elapsed_sec": 78945.16607570648, "step_time_sec": 8.227054726041388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9515, "loss": 3.8929831981658936, "lr": 2.711781466123969e-05, "elapsed_sec": 78953.39389181137, "step_time_sec": 8.227607069013175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9516, "loss": 4.036838054656982, "lr": 2.7049487750108224e-05, "elapsed_sec": 78961.62263584137, "step_time_sec": 8.228588911006227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9517, "loss": 3.963085174560547, "lr": 2.698128403201021e-05, "elapsed_sec": 78969.85417246819, "step_time_sec": 8.231382626050618, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9518, "loss": 3.9309017658233643, "lr": 2.6913203614734947e-05, "elapsed_sec": 78978.08502292633, "step_time_sec": 8.230665160983335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9519, "loss": 4.006977558135986, "lr": 2.684524660587668e-05, "elapsed_sec": 78986.31639766693, "step_time_sec": 8.23122582299402, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9520, "loss": 3.842883586883545, "lr": 2.677741311283475e-05, "elapsed_sec": 78994.54736113548, "step_time_sec": 8.230808796011843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9521, "loss": 3.8609542846679688, "lr": 2.6709703242813244e-05, "elapsed_sec": 79002.77876520157, "step_time_sec": 8.23126085504191, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9522, "loss": 4.076748371124268, "lr": 2.6642117102820862e-05, "elapsed_sec": 79011.00978636742, "step_time_sec": 8.230862558004446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9523, "loss": 3.964043378829956, "lr": 2.6574654799670797e-05, "elapsed_sec": 79019.24016427994, "step_time_sec": 8.230170277005527, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9524, "loss": 3.9806792736053467, "lr": 2.6507316439980504e-05, "elapsed_sec": 79027.46887946129, "step_time_sec": 8.228613618004601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9525, "loss": 3.8306057453155518, "lr": 2.644010213017156e-05, "elapsed_sec": 79035.69942736626, "step_time_sec": 8.230393004021607, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9526, "loss": 3.9516756534576416, "lr": 2.6373011976469492e-05, "elapsed_sec": 79043.93130326271, "step_time_sec": 8.231650245026685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9527, "loss": 3.9424216747283936, "lr": 2.6306046084903616e-05, "elapsed_sec": 79052.16224908829, "step_time_sec": 8.230795623036101, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9528, "loss": 3.9549720287323, "lr": 2.6239204561306896e-05, "elapsed_sec": 79060.39202785492, "step_time_sec": 8.229590466013178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9529, "loss": 3.9300742149353027, "lr": 2.6172487511315686e-05, "elapsed_sec": 79068.62348437309, "step_time_sec": 8.231330272974446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9530, "loss": 3.993058443069458, "lr": 2.6105895040369655e-05, "elapsed_sec": 79076.85383486748, "step_time_sec": 8.230132287950255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9531, "loss": 3.8911149501800537, "lr": 2.6039427253711583e-05, "elapsed_sec": 79085.083984375, "step_time_sec": 8.229972993955016, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9532, "loss": 3.997762680053711, "lr": 2.597308425638719e-05, "elapsed_sec": 79093.31343722343, "step_time_sec": 8.229297877987847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9533, "loss": 4.173551082611084, "lr": 2.5906866153245006e-05, "elapsed_sec": 79101.54452180862, "step_time_sec": 8.230939016037155, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9534, "loss": 3.8910605907440186, "lr": 2.5840773048936134e-05, "elapsed_sec": 79109.77260303497, "step_time_sec": 8.227914824034087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9535, "loss": 4.028104782104492, "lr": 2.577480504791417e-05, "elapsed_sec": 79118.00325655937, "step_time_sec": 8.230488645029254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9536, "loss": 4.084015846252441, "lr": 2.5708962254434972e-05, "elapsed_sec": 79126.23391389847, "step_time_sec": 8.230553035973571, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9537, "loss": 3.887598752975464, "lr": 2.5643244772556538e-05, "elapsed_sec": 79134.46493005753, "step_time_sec": 8.23081988299964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9538, "loss": 3.973214864730835, "lr": 2.5577652706138774e-05, "elapsed_sec": 79142.69526982307, "step_time_sec": 8.230134557990823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9539, "loss": 4.0510993003845215, "lr": 2.5512186158843477e-05, "elapsed_sec": 79150.92597126961, "step_time_sec": 8.23055698495591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9540, "loss": 3.9176158905029297, "lr": 2.5446845234133986e-05, "elapsed_sec": 79159.1569018364, "step_time_sec": 8.23071638197871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9541, "loss": 3.869685411453247, "lr": 2.5381630035275144e-05, "elapsed_sec": 79167.3872961998, "step_time_sec": 8.230258487979881, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9542, "loss": 3.7802178859710693, "lr": 2.5316540665333074e-05, "elapsed_sec": 79175.61742854118, "step_time_sec": 8.229939446027856, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9543, "loss": 3.8867859840393066, "lr": 2.5251577227175064e-05, "elapsed_sec": 79183.84865903854, "step_time_sec": 8.231081266014371, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9544, "loss": 3.977008581161499, "lr": 2.5186739823469372e-05, "elapsed_sec": 79192.07971024513, "step_time_sec": 8.230885230994318, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9545, "loss": 4.006402015686035, "lr": 2.512202855668506e-05, "elapsed_sec": 79200.31051039696, "step_time_sec": 8.230637748958543, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9546, "loss": 3.844935894012451, "lr": 2.5057443529091842e-05, "elapsed_sec": 79208.54205965996, "step_time_sec": 8.231373306014575, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9547, "loss": 3.942101240158081, "lr": 2.499298484275995e-05, "elapsed_sec": 79216.7729871273, "step_time_sec": 8.230795539042447, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9548, "loss": 3.894336462020874, "lr": 2.492865259955989e-05, "elapsed_sec": 79225.00398135185, "step_time_sec": 8.230766858032439, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9549, "loss": 3.95975661277771, "lr": 2.4864446901162436e-05, "elapsed_sec": 79233.23479938507, "step_time_sec": 8.23066816298524, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9550, "loss": 3.9878060817718506, "lr": 2.480036784903827e-05, "elapsed_sec": 79241.46556568146, "step_time_sec": 8.23060600599274, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9551, "loss": 4.024350166320801, "lr": 2.473641554445798e-05, "elapsed_sec": 79249.69707393646, "step_time_sec": 8.23133155098185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9552, "loss": 3.9657795429229736, "lr": 2.4672590088491806e-05, "elapsed_sec": 79257.92667341232, "step_time_sec": 8.22945805900963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9553, "loss": 3.9720818996429443, "lr": 2.4608891582009555e-05, "elapsed_sec": 79266.15548992157, "step_time_sec": 8.228688489005435, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9554, "loss": 4.065577983856201, "lr": 2.4545320125680355e-05, "elapsed_sec": 79274.38434481621, "step_time_sec": 8.228720454033464, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9555, "loss": 3.996490001678467, "lr": 2.4481875819972608e-05, "elapsed_sec": 79282.61186528206, "step_time_sec": 8.227332698006649, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9556, "loss": 4.045515537261963, "lr": 2.4418558765153673e-05, "elapsed_sec": 79290.84271550179, "step_time_sec": 8.230677443032619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9557, "loss": 3.87015438079834, "lr": 2.4355369061289945e-05, "elapsed_sec": 79299.07362604141, "step_time_sec": 8.230782173981424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9558, "loss": 3.9875729084014893, "lr": 2.429230680824638e-05, "elapsed_sec": 79307.3045372963, "step_time_sec": 8.230669135984499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9559, "loss": 4.114431858062744, "lr": 2.422937210568667e-05, "elapsed_sec": 79315.53594446182, "step_time_sec": 8.231279092025943, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9560, "loss": 4.085266590118408, "lr": 2.4166565053072788e-05, "elapsed_sec": 79323.7671353817, "step_time_sec": 8.231036651006434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9561, "loss": 3.843264102935791, "lr": 2.4103885749665084e-05, "elapsed_sec": 79331.99778652191, "step_time_sec": 8.230456182966009, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9562, "loss": 3.959467649459839, "lr": 2.4041334294521958e-05, "elapsed_sec": 79340.22905683517, "step_time_sec": 8.23112021398265, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9563, "loss": 4.055127143859863, "lr": 2.3978910786499747e-05, "elapsed_sec": 79348.45969295502, "step_time_sec": 8.230459882004652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9564, "loss": 4.1067423820495605, "lr": 2.3916615324252617e-05, "elapsed_sec": 79356.6908519268, "step_time_sec": 8.231018659018446, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9565, "loss": 3.979738235473633, "lr": 2.3854448006232337e-05, "elapsed_sec": 79364.92191576958, "step_time_sec": 8.230857320013456, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9566, "loss": 3.796149253845215, "lr": 2.3792408930688176e-05, "elapsed_sec": 79373.15418124199, "step_time_sec": 8.232119544001762, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9567, "loss": 3.8466625213623047, "lr": 2.373049819566673e-05, "elapsed_sec": 79381.38493013382, "step_time_sec": 8.230599694012199, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9568, "loss": 4.309581756591797, "lr": 2.3668715899011727e-05, "elapsed_sec": 79389.61607336998, "step_time_sec": 8.230994821002241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9569, "loss": 3.899661064147949, "lr": 2.3607062138364013e-05, "elapsed_sec": 79397.84534025192, "step_time_sec": 8.229066111962311, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9570, "loss": 4.013304710388184, "lr": 2.3545537011161148e-05, "elapsed_sec": 79406.07233572006, "step_time_sec": 8.226825763005763, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9571, "loss": 3.8349037170410156, "lr": 2.3484140614637556e-05, "elapsed_sec": 79414.30352783203, "step_time_sec": 8.231004400993697, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9572, "loss": 4.1779093742370605, "lr": 2.342287304582405e-05, "elapsed_sec": 79422.53384399414, "step_time_sec": 8.230188335990533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9573, "loss": 3.8933417797088623, "lr": 2.3361734401547986e-05, "elapsed_sec": 79430.76524209976, "step_time_sec": 8.231224394985475, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9574, "loss": 3.957371473312378, "lr": 2.3300724778432912e-05, "elapsed_sec": 79438.9957075119, "step_time_sec": 8.230284759949427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9575, "loss": 3.8334062099456787, "lr": 2.323984427289846e-05, "elapsed_sec": 79447.2257361412, "step_time_sec": 8.22990790498443, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9576, "loss": 3.845815896987915, "lr": 2.317909298116021e-05, "elapsed_sec": 79455.45667290688, "step_time_sec": 8.230780719022732, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9577, "loss": 3.973259687423706, "lr": 2.311847099922955e-05, "elapsed_sec": 79463.6870610714, "step_time_sec": 8.23014138196595, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9578, "loss": 3.9948363304138184, "lr": 2.3057978422913476e-05, "elapsed_sec": 79471.9185872078, "step_time_sec": 8.23136832396267, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9579, "loss": 3.829536199569702, "lr": 2.2997615347814506e-05, "elapsed_sec": 79480.14641594887, "step_time_sec": 8.227668064995669, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9580, "loss": 3.8874521255493164, "lr": 2.293738186933045e-05, "elapsed_sec": 79488.3743300438, "step_time_sec": 8.227790695964359, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9581, "loss": 4.018062591552734, "lr": 2.2877278082654395e-05, "elapsed_sec": 79496.60322880745, "step_time_sec": 8.228676992002875, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9582, "loss": 3.7945797443389893, "lr": 2.2817304082774316e-05, "elapsed_sec": 79504.8337404728, "step_time_sec": 8.230347028991673, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9583, "loss": 3.966374635696411, "lr": 2.2757459964473237e-05, "elapsed_sec": 79513.06528377533, "step_time_sec": 8.23138381697936, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9584, "loss": 3.926013231277466, "lr": 2.2697745822328758e-05, "elapsed_sec": 79521.29543018341, "step_time_sec": 8.230009868973866, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9585, "loss": 4.046753883361816, "lr": 2.2638161750713205e-05, "elapsed_sec": 79529.52535700798, "step_time_sec": 8.22974384704139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9586, "loss": 3.9247663021087646, "lr": 2.2578707843793257e-05, "elapsed_sec": 79537.75619721413, "step_time_sec": 8.230723569984548, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9587, "loss": 4.016429901123047, "lr": 2.2519384195529898e-05, "elapsed_sec": 79545.98627758026, "step_time_sec": 8.229934617003892, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9588, "loss": 3.923535108566284, "lr": 2.2460190899678266e-05, "elapsed_sec": 79554.21840238571, "step_time_sec": 8.231888242997229, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9589, "loss": 3.960334300994873, "lr": 2.2401128049787466e-05, "elapsed_sec": 79562.44717645645, "step_time_sec": 8.228591763006989, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9590, "loss": 3.9007134437561035, "lr": 2.234219573920046e-05, "elapsed_sec": 79570.67737340927, "step_time_sec": 8.230048414960038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9591, "loss": 3.959319829940796, "lr": 2.2283394061053912e-05, "elapsed_sec": 79578.90802598, "step_time_sec": 8.230490664951503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9592, "loss": 3.970574140548706, "lr": 2.2224723108278008e-05, "elapsed_sec": 79587.13752245903, "step_time_sec": 8.22933084401302, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9593, "loss": 3.8909976482391357, "lr": 2.216618297359641e-05, "elapsed_sec": 79595.36787605286, "step_time_sec": 8.230273562017828, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9594, "loss": 3.9077084064483643, "lr": 2.2107773749525897e-05, "elapsed_sec": 79603.59723424911, "step_time_sec": 8.229140922951046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9595, "loss": 3.900256395339966, "lr": 2.2049495528376523e-05, "elapsed_sec": 79611.82922244072, "step_time_sec": 8.231832902005408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9596, "loss": 4.059211254119873, "lr": 2.1991348402251154e-05, "elapsed_sec": 79620.06035542488, "step_time_sec": 8.230968551011756, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9597, "loss": 3.9260878562927246, "lr": 2.1933332463045573e-05, "elapsed_sec": 79628.29090952873, "step_time_sec": 8.230462873005308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9598, "loss": 3.8998732566833496, "lr": 2.187544780244822e-05, "elapsed_sec": 79636.52208161354, "step_time_sec": 8.230986518028658, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9599, "loss": 4.105673313140869, "lr": 2.1817694511940008e-05, "elapsed_sec": 79644.75291776657, "step_time_sec": 8.230572130996734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9600, "loss": 3.9096925258636475, "lr": 2.1760072682794294e-05, "elapsed_sec": 79652.98160219193, "step_time_sec": 8.228587962046731, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9601, "loss": 3.9849941730499268, "lr": 2.170258240607663e-05, "elapsed_sec": 79661.21047449112, "step_time_sec": 8.228667849034537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9602, "loss": 3.901670455932617, "lr": 2.1645223772644708e-05, "elapsed_sec": 79669.43983578682, "step_time_sec": 8.229256246995647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9603, "loss": 3.805030345916748, "lr": 2.158799687314812e-05, "elapsed_sec": 79677.6705994606, "step_time_sec": 8.23061004804913, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9604, "loss": 3.903045654296875, "lr": 2.1530901798028297e-05, "elapsed_sec": 79685.9017636776, "step_time_sec": 8.230937548039947, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9605, "loss": 3.833569049835205, "lr": 2.1473938637518357e-05, "elapsed_sec": 79694.13261961937, "step_time_sec": 8.230748982983641, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9606, "loss": 3.9017388820648193, "lr": 2.1417107481642844e-05, "elapsed_sec": 79702.36376237869, "step_time_sec": 8.230918037996162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9607, "loss": 3.955277442932129, "lr": 2.136040842021783e-05, "elapsed_sec": 79710.59437394142, "step_time_sec": 8.230468599998858, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9608, "loss": 3.9649059772491455, "lr": 2.1303841542850476e-05, "elapsed_sec": 79718.8244445324, "step_time_sec": 8.229895172000397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9609, "loss": 3.958897590637207, "lr": 2.1247406938939145e-05, "elapsed_sec": 79736.35568928719, "step_time_sec": 17.531149696966168, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9610, "loss": 3.9068782329559326, "lr": 2.119110469767311e-05, "elapsed_sec": 79744.57200932503, "step_time_sec": 8.216101113008335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9611, "loss": 3.824479818344116, "lr": 2.1134934908032465e-05, "elapsed_sec": 79752.78937029839, "step_time_sec": 8.217243051971309, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9612, "loss": 3.8584437370300293, "lr": 2.1078897658787956e-05, "elapsed_sec": 79761.02110767365, "step_time_sec": 8.23154871998122, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9613, "loss": 3.982478380203247, "lr": 2.1022993038500916e-05, "elapsed_sec": 79769.25188136101, "step_time_sec": 8.230590104998555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9614, "loss": 3.884993076324463, "lr": 2.096722113552303e-05, "elapsed_sec": 79777.48246574402, "step_time_sec": 8.230428136012051, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9615, "loss": 3.867100238800049, "lr": 2.091158203799622e-05, "elapsed_sec": 79785.71250653267, "step_time_sec": 8.229911391041242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9616, "loss": 3.9344749450683594, "lr": 2.085607583385256e-05, "elapsed_sec": 79793.94452953339, "step_time_sec": 8.231818758009467, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9617, "loss": 3.9287431240081787, "lr": 2.0800702610814118e-05, "elapsed_sec": 79802.17436909676, "step_time_sec": 8.229659278993495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9618, "loss": 3.868208408355713, "lr": 2.0745462456392715e-05, "elapsed_sec": 79810.40435814857, "step_time_sec": 8.229913058981765, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9619, "loss": 3.9555904865264893, "lr": 2.0690355457889963e-05, "elapsed_sec": 79818.63632440567, "step_time_sec": 8.231740212999284, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9620, "loss": 3.935736894607544, "lr": 2.0635381702396973e-05, "elapsed_sec": 79826.86612558365, "step_time_sec": 8.229641063022427, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9621, "loss": 3.992995023727417, "lr": 2.05805412767943e-05, "elapsed_sec": 79835.09623169899, "step_time_sec": 8.229948785970919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9622, "loss": 3.8098714351654053, "lr": 2.0525834267751796e-05, "elapsed_sec": 79843.32681560516, "step_time_sec": 8.230384219030384, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9623, "loss": 3.8080074787139893, "lr": 2.0471260761728438e-05, "elapsed_sec": 79851.5569498539, "step_time_sec": 8.230004347045906, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9624, "loss": 3.9792239665985107, "lr": 2.0416820844972237e-05, "elapsed_sec": 79859.78699016571, "step_time_sec": 8.229821917018853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9625, "loss": 3.906609296798706, "lr": 2.0362514603520054e-05, "elapsed_sec": 79868.01716661453, "step_time_sec": 8.230044148978777, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9626, "loss": 4.045366287231445, "lr": 2.03083421231975e-05, "elapsed_sec": 79876.2478826046, "step_time_sec": 8.230530108034145, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9627, "loss": 3.895460844039917, "lr": 2.025430348961879e-05, "elapsed_sec": 79884.47785449028, "step_time_sec": 8.229839424951933, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9628, "loss": 3.9335336685180664, "lr": 2.0200398788186653e-05, "elapsed_sec": 79892.70693826675, "step_time_sec": 8.22888106800383, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9629, "loss": 3.8444197177886963, "lr": 2.0146628104092066e-05, "elapsed_sec": 79900.93305301666, "step_time_sec": 8.225994987995364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9630, "loss": 3.9378180503845215, "lr": 2.0092991522314276e-05, "elapsed_sec": 79909.16454005241, "step_time_sec": 8.231243785994593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9631, "loss": 3.8586809635162354, "lr": 2.0039489127620543e-05, "elapsed_sec": 79917.39467811584, "step_time_sec": 8.22999715298647, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9632, "loss": 4.056429386138916, "lr": 1.99861210045661e-05, "elapsed_sec": 79925.62432003021, "step_time_sec": 8.22949595702812, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9633, "loss": 3.8303873538970947, "lr": 1.9932887237493942e-05, "elapsed_sec": 79933.85481405258, "step_time_sec": 8.230317929002922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9634, "loss": 3.921485424041748, "lr": 1.9879787910534757e-05, "elapsed_sec": 79942.0854871273, "step_time_sec": 8.230503808008507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9635, "loss": 4.023659706115723, "lr": 1.9826823107606748e-05, "elapsed_sec": 79950.31632637978, "step_time_sec": 8.230680005042814, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9636, "loss": 3.9806149005889893, "lr": 1.9773992912415528e-05, "elapsed_sec": 79958.54650759697, "step_time_sec": 8.230061343987472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9637, "loss": 3.8155086040496826, "lr": 1.9721297408453926e-05, "elapsed_sec": 79966.7768175602, "step_time_sec": 8.230142952001188, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9638, "loss": 3.988969087600708, "lr": 1.966873667900201e-05, "elapsed_sec": 79975.00785398483, "step_time_sec": 8.23088628903497, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9639, "loss": 4.138105869293213, "lr": 1.961631080712675e-05, "elapsed_sec": 79983.23948121071, "step_time_sec": 8.23146766499849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9640, "loss": 3.939276695251465, "lr": 1.9564019875682027e-05, "elapsed_sec": 79991.46991252899, "step_time_sec": 8.230255676957313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9641, "loss": 4.112499237060547, "lr": 1.951186396730846e-05, "elapsed_sec": 79999.70175552368, "step_time_sec": 8.231653655995615, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9642, "loss": 3.8801608085632324, "lr": 1.9459843164433284e-05, "elapsed_sec": 80007.93161773682, "step_time_sec": 8.229787785967346, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9643, "loss": 3.972658395767212, "lr": 1.9407957549270194e-05, "elapsed_sec": 80016.16312909126, "step_time_sec": 8.231253348989412, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9644, "loss": 3.9015166759490967, "lr": 1.9356207203819236e-05, "elapsed_sec": 80024.39342308044, "step_time_sec": 8.230153349984903, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9645, "loss": 3.8320109844207764, "lr": 1.9304592209866717e-05, "elapsed_sec": 80032.62463021278, "step_time_sec": 8.231049827008974, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9646, "loss": 3.8037564754486084, "lr": 1.9253112648984975e-05, "elapsed_sec": 80040.85521554947, "step_time_sec": 8.230458367965184, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9647, "loss": 3.95276141166687, "lr": 1.9201768602532325e-05, "elapsed_sec": 80049.08705639839, "step_time_sec": 8.231650046014693, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9648, "loss": 3.7964532375335693, "lr": 1.9150560151652985e-05, "elapsed_sec": 80057.31746697426, "step_time_sec": 8.23021350504132, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9649, "loss": 3.8013768196105957, "lr": 1.9099487377276752e-05, "elapsed_sec": 80065.54834985733, "step_time_sec": 8.230747735011391, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9650, "loss": 3.839395046234131, "lr": 1.904855036011911e-05, "elapsed_sec": 80073.77899718285, "step_time_sec": 8.230486416025087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9651, "loss": 3.945410966873169, "lr": 1.8997749180680944e-05, "elapsed_sec": 80082.00914120674, "step_time_sec": 8.23000190797029, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9652, "loss": 3.8641273975372314, "lr": 1.8947083919248456e-05, "elapsed_sec": 80090.24069190025, "step_time_sec": 8.231361935962923, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9653, "loss": 3.8532423973083496, "lr": 1.889655465589305e-05, "elapsed_sec": 80098.47172021866, "step_time_sec": 8.230920767993666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9654, "loss": 3.859632968902588, "lr": 1.8846161470471216e-05, "elapsed_sec": 80106.70292162895, "step_time_sec": 8.231005291978363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9655, "loss": 3.6660265922546387, "lr": 1.879590444262437e-05, "elapsed_sec": 80114.93422555923, "step_time_sec": 8.231192732986528, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9656, "loss": 3.8648433685302734, "lr": 1.874578365177874e-05, "elapsed_sec": 80123.16449332237, "step_time_sec": 8.230040962982457, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9657, "loss": 4.005981922149658, "lr": 1.8695799177145247e-05, "elapsed_sec": 80131.39283466339, "step_time_sec": 8.22817060694797, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9658, "loss": 3.929623603820801, "lr": 1.864595109771941e-05, "elapsed_sec": 80139.6232392788, "step_time_sec": 8.230247644998599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9659, "loss": 3.966996431350708, "lr": 1.8596239492281126e-05, "elapsed_sec": 80147.85440325737, "step_time_sec": 8.231014431046788, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9660, "loss": 3.99814772605896, "lr": 1.8546664439394684e-05, "elapsed_sec": 80156.0857758522, "step_time_sec": 8.231192913022824, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9661, "loss": 3.971198797225952, "lr": 1.8497226017408473e-05, "elapsed_sec": 80164.31638360023, "step_time_sec": 8.230422938999254, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9662, "loss": 4.100025653839111, "lr": 1.844792430445505e-05, "elapsed_sec": 80172.54744172096, "step_time_sec": 8.23088101099711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9663, "loss": 3.7684574127197266, "lr": 1.8398759378450864e-05, "elapsed_sec": 80180.77890849113, "step_time_sec": 8.231317784986459, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9664, "loss": 4.002899646759033, "lr": 1.8349731317096165e-05, "elapsed_sec": 80189.0089199543, "step_time_sec": 8.22988553502364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9665, "loss": 3.9556009769439697, "lr": 1.830084019787495e-05, "elapsed_sec": 80197.23905825615, "step_time_sec": 8.229912404029164, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9666, "loss": 3.888631820678711, "lr": 1.8252086098054773e-05, "elapsed_sec": 80205.47006559372, "step_time_sec": 8.230880418035667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9667, "loss": 3.9097185134887695, "lr": 1.8203469094686626e-05, "elapsed_sec": 80213.70204210281, "step_time_sec": 8.231794950028416, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9668, "loss": 3.9682607650756836, "lr": 1.8154989264604854e-05, "elapsed_sec": 80221.9322142601, "step_time_sec": 8.230059919005726, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9669, "loss": 3.847616195678711, "lr": 1.8106646684427002e-05, "elapsed_sec": 80230.16277599335, "step_time_sec": 8.230359229026362, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9670, "loss": 4.0998406410217285, "lr": 1.8058441430553754e-05, "elapsed_sec": 80238.3933532238, "step_time_sec": 8.230447815032676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9671, "loss": 3.855832815170288, "lr": 1.801037357916865e-05, "elapsed_sec": 80246.62402391434, "step_time_sec": 8.230470123002306, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9672, "loss": 3.9136414527893066, "lr": 1.7962443206238218e-05, "elapsed_sec": 80254.85443234444, "step_time_sec": 8.230247095052619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9673, "loss": 4.010684013366699, "lr": 1.7914650387511576e-05, "elapsed_sec": 80263.08540987968, "step_time_sec": 8.230860857001971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9674, "loss": 3.8803515434265137, "lr": 1.7866995198520586e-05, "elapsed_sec": 80271.31537413597, "step_time_sec": 8.229761009046342, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9675, "loss": 4.0636725425720215, "lr": 1.78194777145795e-05, "elapsed_sec": 80279.54541349411, "step_time_sec": 8.229920267011039, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9676, "loss": 3.9003148078918457, "lr": 1.7772098010784986e-05, "elapsed_sec": 80287.77624869347, "step_time_sec": 8.230630963982549, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9677, "loss": 3.8690690994262695, "lr": 1.772485616201596e-05, "elapsed_sec": 80296.00677680969, "step_time_sec": 8.23037619504612, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9678, "loss": 3.806309938430786, "lr": 1.767775224293345e-05, "elapsed_sec": 80304.23764228821, "step_time_sec": 8.23070864501642, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9679, "loss": 4.104711055755615, "lr": 1.7630786327980538e-05, "elapsed_sec": 80312.46753525734, "step_time_sec": 8.229743630043231, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9680, "loss": 3.913907766342163, "lr": 1.7583958491382154e-05, "elapsed_sec": 80320.69878935814, "step_time_sec": 8.231096287025139, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9681, "loss": 3.984950304031372, "lr": 1.7537268807145053e-05, "elapsed_sec": 80328.92952489853, "step_time_sec": 8.230578396993224, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9682, "loss": 4.00977897644043, "lr": 1.7490717349057675e-05, "elapsed_sec": 80337.15980577469, "step_time_sec": 8.230135929014068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9683, "loss": 3.9272303581237793, "lr": 1.74443041906899e-05, "elapsed_sec": 80345.39051675797, "step_time_sec": 8.230522394995205, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9684, "loss": 3.9286131858825684, "lr": 1.7398029405393168e-05, "elapsed_sec": 80353.620459795, "step_time_sec": 8.229784481984098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9685, "loss": 3.8977694511413574, "lr": 1.7351893066300128e-05, "elapsed_sec": 80361.85259413719, "step_time_sec": 8.23196597903734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9686, "loss": 4.17830228805542, "lr": 1.730589524632472e-05, "elapsed_sec": 80370.0831580162, "step_time_sec": 8.230484968982637, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9687, "loss": 3.9720306396484375, "lr": 1.7260036018161893e-05, "elapsed_sec": 80378.31166696548, "step_time_sec": 8.228316145017743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9688, "loss": 3.9890129566192627, "lr": 1.721431545428762e-05, "elapsed_sec": 80386.5411040783, "step_time_sec": 8.229228170006536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9689, "loss": 3.7553250789642334, "lr": 1.716873362695869e-05, "elapsed_sec": 80394.77233409882, "step_time_sec": 8.231075444957241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9690, "loss": 3.957719087600708, "lr": 1.712329060821266e-05, "elapsed_sec": 80403.00238227844, "step_time_sec": 8.229902727005538, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9691, "loss": 3.8693478107452393, "lr": 1.70779864698677e-05, "elapsed_sec": 80411.23080301285, "step_time_sec": 8.228256781003438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9692, "loss": 3.9189956188201904, "lr": 1.7032821283522485e-05, "elapsed_sec": 80419.45912241936, "step_time_sec": 8.228118831990287, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9693, "loss": 3.943286418914795, "lr": 1.698779512055609e-05, "elapsed_sec": 80427.68870615959, "step_time_sec": 8.229507609037682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9694, "loss": 3.8663737773895264, "lr": 1.6942908052127944e-05, "elapsed_sec": 80435.91903400421, "step_time_sec": 8.230118494015187, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9695, "loss": 3.8873021602630615, "lr": 1.6898160149177526e-05, "elapsed_sec": 80444.1494922638, "step_time_sec": 8.230293912056368, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9696, "loss": 3.7957842350006104, "lr": 1.685355148242451e-05, "elapsed_sec": 80452.37957000732, "step_time_sec": 8.229926448024344, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9697, "loss": 3.8520569801330566, "lr": 1.68090821223684e-05, "elapsed_sec": 80460.61042642593, "step_time_sec": 8.230670737975743, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9698, "loss": 3.949327230453491, "lr": 1.676475213928863e-05, "elapsed_sec": 80468.84072422981, "step_time_sec": 8.230146257032175, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9699, "loss": 4.1204752922058105, "lr": 1.672056160324434e-05, "elapsed_sec": 80477.07200193405, "step_time_sec": 8.231153176981024, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9700, "loss": 3.9189655780792236, "lr": 1.667651058407425e-05, "elapsed_sec": 80485.30178642273, "step_time_sec": 8.229588721995242, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9701, "loss": 3.928295373916626, "lr": 1.6632599151396634e-05, "elapsed_sec": 80493.5326898098, "step_time_sec": 8.230762088031042, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9702, "loss": 3.9542248249053955, "lr": 1.6588827374609144e-05, "elapsed_sec": 80501.76363945007, "step_time_sec": 8.230721375031862, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9703, "loss": 3.9552226066589355, "lr": 1.6545195322888707e-05, "elapsed_sec": 80509.99394202232, "step_time_sec": 8.230175675998908, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9704, "loss": 3.805137872695923, "lr": 1.6501703065191453e-05, "elapsed_sec": 80518.22573876381, "step_time_sec": 8.231705044046976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9705, "loss": 3.875199317932129, "lr": 1.645835067025256e-05, "elapsed_sec": 80526.4561882019, "step_time_sec": 8.230237804993521, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9706, "loss": 3.9231791496276855, "lr": 1.6415138206586208e-05, "elapsed_sec": 80534.68758630753, "step_time_sec": 8.231296431971714, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9707, "loss": 3.8063857555389404, "lr": 1.637206574248536e-05, "elapsed_sec": 80542.91914534569, "step_time_sec": 8.231319108977914, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9708, "loss": 3.9715709686279297, "lr": 1.6329133346021785e-05, "elapsed_sec": 80551.14926028252, "step_time_sec": 8.230005157005507, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9709, "loss": 4.012633323669434, "lr": 1.6286341085045864e-05, "elapsed_sec": 80559.3806977272, "step_time_sec": 8.231243729009293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9710, "loss": 3.967766046524048, "lr": 1.6243689027186507e-05, "elapsed_sec": 80567.61088109016, "step_time_sec": 8.23002647398971, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9711, "loss": 4.0246758460998535, "lr": 1.6201177239851054e-05, "elapsed_sec": 80575.84228730202, "step_time_sec": 8.231249799020588, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9712, "loss": 4.013139724731445, "lr": 1.6158805790225157e-05, "elapsed_sec": 80584.0735552311, "step_time_sec": 8.231116694980301, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9713, "loss": 3.7401671409606934, "lr": 1.611657474527267e-05, "elapsed_sec": 80592.30470728874, "step_time_sec": 8.231039502017666, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9714, "loss": 3.904189348220825, "lr": 1.607448417173557e-05, "elapsed_sec": 80600.53634428978, "step_time_sec": 8.231454745982774, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9715, "loss": 3.7860524654388428, "lr": 1.6032534136133827e-05, "elapsed_sec": 80608.76736092567, "step_time_sec": 8.230814683018252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9716, "loss": 3.8720550537109375, "lr": 1.599072470476528e-05, "elapsed_sec": 80616.99567842484, "step_time_sec": 8.228152572002728, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9717, "loss": 4.007847785949707, "lr": 1.5949055943705593e-05, "elapsed_sec": 80625.22405719757, "step_time_sec": 8.228236914961599, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9718, "loss": 3.9531407356262207, "lr": 1.59075279188081e-05, "elapsed_sec": 80633.45389533043, "step_time_sec": 8.229744224983733, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9719, "loss": 3.7790110111236572, "lr": 1.5866140695703725e-05, "elapsed_sec": 80641.68387365341, "step_time_sec": 8.229734437016305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9720, "loss": 3.9057488441467285, "lr": 1.5824894339800838e-05, "elapsed_sec": 80649.91569328308, "step_time_sec": 8.231674532988109, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9721, "loss": 3.9577622413635254, "lr": 1.5783788916285216e-05, "elapsed_sec": 80658.14647626877, "step_time_sec": 8.230633091996424, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9722, "loss": 3.837811231613159, "lr": 1.574282449011989e-05, "elapsed_sec": 80666.37718343735, "step_time_sec": 8.230581602023449, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9723, "loss": 3.8705413341522217, "lr": 1.5702001126045058e-05, "elapsed_sec": 80674.60786867142, "step_time_sec": 8.230485990992747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9724, "loss": 3.7641732692718506, "lr": 1.5661318888578e-05, "elapsed_sec": 80682.83802938461, "step_time_sec": 8.230037016037386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9725, "loss": 3.9221606254577637, "lr": 1.5620777842012934e-05, "elapsed_sec": 80691.0693731308, "step_time_sec": 8.231184406962711, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9726, "loss": 3.829725742340088, "lr": 1.5580378050420945e-05, "elapsed_sec": 80699.2989654541, "step_time_sec": 8.229399636969902, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9727, "loss": 3.901383638381958, "lr": 1.5540119577649903e-05, "elapsed_sec": 80707.52938747406, "step_time_sec": 8.230290320992935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9728, "loss": 4.170760154724121, "lr": 1.550000248732433e-05, "elapsed_sec": 80715.76019525528, "step_time_sec": 8.230589838989545, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9729, "loss": 3.944826364517212, "lr": 1.546002684284528e-05, "elapsed_sec": 80723.9901380539, "step_time_sec": 8.22978418204002, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9730, "loss": 3.8672187328338623, "lr": 1.5420192707390286e-05, "elapsed_sec": 80732.22028255463, "step_time_sec": 8.229998793976847, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9731, "loss": 3.878934383392334, "lr": 1.538050014391325e-05, "elapsed_sec": 80740.4490852356, "step_time_sec": 8.228646251023747, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9732, "loss": 3.8386762142181396, "lr": 1.53409492151443e-05, "elapsed_sec": 80748.68012332916, "step_time_sec": 8.230914306943305, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9733, "loss": 3.962251663208008, "lr": 1.5301539983589772e-05, "elapsed_sec": 80756.91070437431, "step_time_sec": 8.230347608972806, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9734, "loss": 3.7706682682037354, "lr": 1.5262272511532017e-05, "elapsed_sec": 80765.14084792137, "step_time_sec": 8.230002927943133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9735, "loss": 3.7548675537109375, "lr": 1.522314686102938e-05, "elapsed_sec": 80773.37174296379, "step_time_sec": 8.230741934967227, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9736, "loss": 4.079793930053711, "lr": 1.5184163093916045e-05, "elapsed_sec": 80781.60169816017, "step_time_sec": 8.229802943009418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9737, "loss": 3.8363089561462402, "lr": 1.5145321271802025e-05, "elapsed_sec": 80789.83258676529, "step_time_sec": 8.230711265990976, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9738, "loss": 4.015707969665527, "lr": 1.510662145607289e-05, "elapsed_sec": 80798.06347441673, "step_time_sec": 8.230727030022535, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9739, "loss": 3.7228636741638184, "lr": 1.5068063707889908e-05, "elapsed_sec": 80806.29437494278, "step_time_sec": 8.230774433992337, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9740, "loss": 3.9385268688201904, "lr": 1.5029648088189749e-05, "elapsed_sec": 80814.52540206909, "step_time_sec": 8.230849782994483, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9741, "loss": 3.765470266342163, "lr": 1.4991374657684469e-05, "elapsed_sec": 80822.75590348244, "step_time_sec": 8.230379140994046, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9742, "loss": 3.8084681034088135, "lr": 1.4953243476861421e-05, "elapsed_sec": 80830.98486328125, "step_time_sec": 8.22883354802616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9743, "loss": 3.916717767715454, "lr": 1.4915254605983156e-05, "elapsed_sec": 80839.21440553665, "step_time_sec": 8.22930261399597, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9744, "loss": 3.8987364768981934, "lr": 1.4877408105087296e-05, "elapsed_sec": 80847.44288492203, "step_time_sec": 8.228350330027752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9745, "loss": 3.8997836112976074, "lr": 1.483970403398648e-05, "elapsed_sec": 80855.67342543602, "step_time_sec": 8.2303536409745, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9746, "loss": 3.754328489303589, "lr": 1.4802142452268228e-05, "elapsed_sec": 80863.90445280075, "step_time_sec": 8.230903777992353, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9747, "loss": 4.026552677154541, "lr": 1.4764723419294926e-05, "elapsed_sec": 80872.13477659225, "step_time_sec": 8.23014244902879, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9748, "loss": 4.052819728851318, "lr": 1.4727446994203578e-05, "elapsed_sec": 80880.3626832962, "step_time_sec": 8.227756230975501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9749, "loss": 3.8999016284942627, "lr": 1.4690313235905914e-05, "elapsed_sec": 80888.59135770798, "step_time_sec": 8.228552329004742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9750, "loss": 3.7980024814605713, "lr": 1.4653322203088107e-05, "elapsed_sec": 80896.82175016403, "step_time_sec": 8.230216999014374, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9751, "loss": 3.865999937057495, "lr": 1.4616473954210834e-05, "elapsed_sec": 80905.05261421204, "step_time_sec": 8.230683240981307, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9752, "loss": 3.9038968086242676, "lr": 1.4579768547509091e-05, "elapsed_sec": 80913.28330659866, "step_time_sec": 8.230513922986574, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9753, "loss": 3.751495838165283, "lr": 1.4543206040992108e-05, "elapsed_sec": 80921.51380324364, "step_time_sec": 8.230401058972348, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9754, "loss": 3.8551363945007324, "lr": 1.4506786492443295e-05, "elapsed_sec": 80929.74476337433, "step_time_sec": 8.23077474301681, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9755, "loss": 3.841087579727173, "lr": 1.4470509959420134e-05, "elapsed_sec": 80937.97498965263, "step_time_sec": 8.23001626500627, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9756, "loss": 3.9732611179351807, "lr": 1.4434376499254066e-05, "elapsed_sec": 80946.20634317398, "step_time_sec": 8.231214403000195, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9757, "loss": 3.762559175491333, "lr": 1.4398386169050442e-05, "elapsed_sec": 80954.43663334846, "step_time_sec": 8.230197414988652, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9758, "loss": 3.8600213527679443, "lr": 1.436253902568838e-05, "elapsed_sec": 80962.66756439209, "step_time_sec": 8.23072679003235, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9759, "loss": 3.8299319744110107, "lr": 1.4326835125820773e-05, "elapsed_sec": 80970.89826965332, "step_time_sec": 8.230535683047492, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9760, "loss": 3.8839080333709717, "lr": 1.429127452587403e-05, "elapsed_sec": 80979.12668585777, "step_time_sec": 8.228225698985625, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9761, "loss": 3.8623650074005127, "lr": 1.4255857282048188e-05, "elapsed_sec": 80987.3568687439, "step_time_sec": 8.23003198200604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9762, "loss": 4.051156520843506, "lr": 1.4220583450316626e-05, "elapsed_sec": 80995.5861916542, "step_time_sec": 8.229176929977257, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9763, "loss": 3.8915719985961914, "lr": 1.4185453086426185e-05, "elapsed_sec": 81003.81696724892, "step_time_sec": 8.23065179400146, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9764, "loss": 3.963529109954834, "lr": 1.4150466245896875e-05, "elapsed_sec": 81012.0470483303, "step_time_sec": 8.229867422021925, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9765, "loss": 3.92864727973938, "lr": 1.4115622984021944e-05, "elapsed_sec": 81020.27669429779, "step_time_sec": 8.229473701969255, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9766, "loss": 3.8758015632629395, "lr": 1.408092335586768e-05, "elapsed_sec": 81028.50722122192, "step_time_sec": 8.230381414992735, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9767, "loss": 3.9347734451293945, "lr": 1.4046367416273403e-05, "elapsed_sec": 81036.73778533936, "step_time_sec": 8.230399592022877, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9768, "loss": 3.885828971862793, "lr": 1.4011955219851336e-05, "elapsed_sec": 81044.9692299366, "step_time_sec": 8.231304319982883, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9769, "loss": 4.057837009429932, "lr": 1.3977686820986544e-05, "elapsed_sec": 81053.20023059845, "step_time_sec": 8.230832126981113, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9770, "loss": 3.7399425506591797, "lr": 1.3943562273836798e-05, "elapsed_sec": 81061.43036198616, "step_time_sec": 8.22998068697052, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9771, "loss": 3.810839891433716, "lr": 1.3909581632332604e-05, "elapsed_sec": 81069.66025614738, "step_time_sec": 8.22973619500408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9772, "loss": 3.9432108402252197, "lr": 1.3875744950176929e-05, "elapsed_sec": 81077.89072155952, "step_time_sec": 8.230325269978493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9773, "loss": 3.7604026794433594, "lr": 1.3842052280845327e-05, "elapsed_sec": 81086.12137675285, "step_time_sec": 8.23047350696288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9774, "loss": 3.948772668838501, "lr": 1.3808503677585681e-05, "elapsed_sec": 81094.35166573524, "step_time_sec": 8.230111506010871, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9775, "loss": 4.004279613494873, "lr": 1.3775099193418239e-05, "elapsed_sec": 81102.58037948608, "step_time_sec": 8.228612095001154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9776, "loss": 3.8712520599365234, "lr": 1.3741838881135463e-05, "elapsed_sec": 81110.80870127678, "step_time_sec": 8.228142312029377, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9777, "loss": 3.891082525253296, "lr": 1.3708722793301978e-05, "elapsed_sec": 81119.0355181694, "step_time_sec": 8.226652922981884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9778, "loss": 3.9171221256256104, "lr": 1.3675750982254448e-05, "elapsed_sec": 81127.26692152023, "step_time_sec": 8.231206515978556, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9779, "loss": 3.9166359901428223, "lr": 1.3642923500101561e-05, "elapsed_sec": 81135.4975233078, "step_time_sec": 8.230518286989536, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9780, "loss": 3.925676107406616, "lr": 1.361024039872388e-05, "elapsed_sec": 81143.72651362419, "step_time_sec": 8.22880735999206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9781, "loss": 3.919654369354248, "lr": 1.3577701729773789e-05, "elapsed_sec": 81151.95601391792, "step_time_sec": 8.229284671018831, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9782, "loss": 3.8507261276245117, "lr": 1.3545307544675429e-05, "elapsed_sec": 81160.18602204323, "step_time_sec": 8.229836614977103, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9783, "loss": 3.858675241470337, "lr": 1.3513057894624601e-05, "elapsed_sec": 81168.41606807709, "step_time_sec": 8.229933176015038, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9784, "loss": 4.0090155601501465, "lr": 1.3480952830588633e-05, "elapsed_sec": 81176.64513492584, "step_time_sec": 8.228875224012882, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9785, "loss": 3.91740083694458, "lr": 1.3448992403306431e-05, "elapsed_sec": 81184.87328124046, "step_time_sec": 8.227982467971742, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9786, "loss": 3.9107015132904053, "lr": 1.3417176663288232e-05, "elapsed_sec": 81193.10165333748, "step_time_sec": 8.228220675024204, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9787, "loss": 3.941312551498413, "lr": 1.3385505660815682e-05, "elapsed_sec": 81201.33124279976, "step_time_sec": 8.229438639013097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9788, "loss": 3.8481483459472656, "lr": 1.3353979445941624e-05, "elapsed_sec": 81209.56250405312, "step_time_sec": 8.231088500993792, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9789, "loss": 4.017155170440674, "lr": 1.3322598068490128e-05, "elapsed_sec": 81217.79247808456, "step_time_sec": 8.22985405701911, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9790, "loss": 3.9194271564483643, "lr": 1.329136157805633e-05, "elapsed_sec": 81226.02332520485, "step_time_sec": 8.23068337602308, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9791, "loss": 3.8873021602630615, "lr": 1.3260270024006398e-05, "elapsed_sec": 81234.25477576256, "step_time_sec": 8.231278093997389, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9792, "loss": 3.9396684169769287, "lr": 1.3229323455477443e-05, "elapsed_sec": 81242.48536634445, "step_time_sec": 8.230482554004993, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9793, "loss": 3.92446231842041, "lr": 1.319852192137744e-05, "elapsed_sec": 81250.71635937691, "step_time_sec": 8.230799593962729, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9794, "loss": 4.0482892990112305, "lr": 1.3167865470385133e-05, "elapsed_sec": 81258.94808483124, "step_time_sec": 8.231607905996498, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9795, "loss": 3.9449479579925537, "lr": 1.313735415095004e-05, "elapsed_sec": 81267.17846179008, "step_time_sec": 8.230152437987272, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9796, "loss": 3.951085090637207, "lr": 1.3106988011292203e-05, "elapsed_sec": 81275.40909934044, "step_time_sec": 8.230471463000868, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9797, "loss": 3.8395607471466064, "lr": 1.3076767099402355e-05, "elapsed_sec": 81283.63991689682, "step_time_sec": 8.23065028100973, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9798, "loss": 3.95662784576416, "lr": 1.3046691463041583e-05, "elapsed_sec": 81291.8703660965, "step_time_sec": 8.23030628199922, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9799, "loss": 3.952653646469116, "lr": 1.3016761149741478e-05, "elapsed_sec": 81300.10053515434, "step_time_sec": 8.230018009024207, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9800, "loss": 3.9082674980163574, "lr": 1.2986976206803908e-05, "elapsed_sec": 81308.33161354065, "step_time_sec": 8.230930492980406, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9801, "loss": 3.8915743827819824, "lr": 1.295733668130101e-05, "elapsed_sec": 81316.56206226349, "step_time_sec": 8.230305694043636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9802, "loss": 3.9770045280456543, "lr": 1.2927842620075123e-05, "elapsed_sec": 81324.79267072678, "step_time_sec": 8.23041351599386, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9803, "loss": 3.829777479171753, "lr": 1.2898494069738668e-05, "elapsed_sec": 81333.02345180511, "step_time_sec": 8.230674482998438, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9804, "loss": 3.9408700466156006, "lr": 1.286929107667411e-05, "elapsed_sec": 81341.2536892891, "step_time_sec": 8.23003316600807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9805, "loss": 3.945939302444458, "lr": 1.2840233687033876e-05, "elapsed_sec": 81349.48553061485, "step_time_sec": 8.231653349997941, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9806, "loss": 3.7798280715942383, "lr": 1.2811321946740271e-05, "elapsed_sec": 81357.71621131897, "step_time_sec": 8.230598596972413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9807, "loss": 4.045722007751465, "lr": 1.278255590148545e-05, "elapsed_sec": 81365.94729542732, "step_time_sec": 8.230882858973928, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9808, "loss": 3.963787078857422, "lr": 1.2753935596731268e-05, "elapsed_sec": 81374.17840123177, "step_time_sec": 8.230985629023053, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9809, "loss": 3.953977584838867, "lr": 1.2725461077709276e-05, "elapsed_sec": 81382.40759301186, "step_time_sec": 8.228932321013417, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9810, "loss": 3.9575564861297607, "lr": 1.2697132389420624e-05, "elapsed_sec": 81390.63554430008, "step_time_sec": 8.227787837968208, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9811, "loss": 3.9539248943328857, "lr": 1.2668949576635975e-05, "elapsed_sec": 81398.86431884766, "step_time_sec": 8.228627520031296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9812, "loss": 3.9865994453430176, "lr": 1.2640912683895483e-05, "elapsed_sec": 81407.09345388412, "step_time_sec": 8.228975619014818, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9813, "loss": 4.150331020355225, "lr": 1.2613021755508649e-05, "elapsed_sec": 81415.32190465927, "step_time_sec": 8.228301275987178, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9814, "loss": 3.833965301513672, "lr": 1.258527683555432e-05, "elapsed_sec": 81423.55347752571, "step_time_sec": 8.231422623968683, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9815, "loss": 4.02117919921875, "lr": 1.255767796788058e-05, "elapsed_sec": 81431.78380799294, "step_time_sec": 8.230174130003434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9816, "loss": 3.9634783267974854, "lr": 1.2530225196104691e-05, "elapsed_sec": 81440.01212453842, "step_time_sec": 8.22816027799854, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9817, "loss": 3.9290456771850586, "lr": 1.2502918563613046e-05, "elapsed_sec": 81448.24195504189, "step_time_sec": 8.229690802050754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9818, "loss": 3.9394195079803467, "lr": 1.2475758113561052e-05, "elapsed_sec": 81456.46953177452, "step_time_sec": 8.227383008983452, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9819, "loss": 4.075735569000244, "lr": 1.2448743888873096e-05, "elapsed_sec": 81464.70111083984, "step_time_sec": 8.231486039003357, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9820, "loss": 3.8194308280944824, "lr": 1.2421875932242482e-05, "elapsed_sec": 81472.93162417412, "step_time_sec": 8.230311957013328, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9821, "loss": 3.8725361824035645, "lr": 1.2395154286131338e-05, "elapsed_sec": 81481.16070818901, "step_time_sec": 8.22897081496194, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9822, "loss": 3.7893385887145996, "lr": 1.2368578992770573e-05, "elapsed_sec": 81489.39108514786, "step_time_sec": 8.230130131007172, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9823, "loss": 4.020795822143555, "lr": 1.2342150094159782e-05, "elapsed_sec": 81497.62161660194, "step_time_sec": 8.230378702981398, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9824, "loss": 3.941283702850342, "lr": 1.2315867632067234e-05, "elapsed_sec": 81505.85113739967, "step_time_sec": 8.229352854017634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9825, "loss": 3.809279441833496, "lr": 1.2289731648029715e-05, "elapsed_sec": 81514.08041357994, "step_time_sec": 8.22919002798153, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9826, "loss": 3.8986616134643555, "lr": 1.2263742183352595e-05, "elapsed_sec": 81522.31088328362, "step_time_sec": 8.230241665034555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9827, "loss": 3.8689825534820557, "lr": 1.2237899279109588e-05, "elapsed_sec": 81530.5420525074, "step_time_sec": 8.231008781993296, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9828, "loss": 3.888293504714966, "lr": 1.2212202976142873e-05, "elapsed_sec": 81538.77243185043, "step_time_sec": 8.230222047015559, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9829, "loss": 3.766948938369751, "lr": 1.2186653315062884e-05, "elapsed_sec": 81547.0021018982, "step_time_sec": 8.229513997968752, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9830, "loss": 3.998818874359131, "lr": 1.216125033624832e-05, "elapsed_sec": 81555.23273563385, "step_time_sec": 8.230537729978096, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9831, "loss": 3.9240829944610596, "lr": 1.213599407984607e-05, "elapsed_sec": 81563.46323180199, "step_time_sec": 8.230277398019098, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9832, "loss": 4.022337913513184, "lr": 1.2110884585771133e-05, "elapsed_sec": 81571.6936788559, "step_time_sec": 8.230291851970833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9833, "loss": 3.9183855056762695, "lr": 1.208592189370656e-05, "elapsed_sec": 81579.92405295372, "step_time_sec": 8.230222126992885, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9834, "loss": 3.895510196685791, "lr": 1.2061106043103415e-05, "elapsed_sec": 81588.15431141853, "step_time_sec": 8.230161482992116, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9835, "loss": 3.865142345428467, "lr": 1.2036437073180663e-05, "elapsed_sec": 81596.38435578346, "step_time_sec": 8.229856420017313, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9836, "loss": 3.864797592163086, "lr": 1.2011915022925203e-05, "elapsed_sec": 81604.61508536339, "step_time_sec": 8.230514817987569, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9837, "loss": 3.730851173400879, "lr": 1.1987539931091631e-05, "elapsed_sec": 81612.84545397758, "step_time_sec": 8.230205119994935, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9838, "loss": 4.076078414916992, "lr": 1.1963311836202423e-05, "elapsed_sec": 81621.07649612427, "step_time_sec": 8.230914446001407, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9839, "loss": 3.9519948959350586, "lr": 1.1939230776547615e-05, "elapsed_sec": 81629.30678200722, "step_time_sec": 8.230146171001252, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9840, "loss": 3.9789724349975586, "lr": 1.191529679018497e-05, "elapsed_sec": 81637.53742742538, "step_time_sec": 8.230431517993566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9841, "loss": 3.857978582382202, "lr": 1.1891509914939764e-05, "elapsed_sec": 81645.76746702194, "step_time_sec": 8.229884335014503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9842, "loss": 4.050124168395996, "lr": 1.1867870188404768e-05, "elapsed_sec": 81653.99904513359, "step_time_sec": 8.231424364028499, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9843, "loss": 3.9387264251708984, "lr": 1.1844377647940238e-05, "elapsed_sec": 81662.23019456863, "step_time_sec": 8.230995770019945, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9844, "loss": 3.855825662612915, "lr": 1.1821032330673794e-05, "elapsed_sec": 81670.4601700306, "step_time_sec": 8.229823592002504, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9845, "loss": 3.878720283508301, "lr": 1.1797834273500393e-05, "elapsed_sec": 81678.69113469124, "step_time_sec": 8.230810034961905, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9846, "loss": 3.9887754917144775, "lr": 1.1774783513082247e-05, "elapsed_sec": 81686.92109775543, "step_time_sec": 8.229807807016186, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9847, "loss": 4.03014612197876, "lr": 1.1751880085848784e-05, "elapsed_sec": 81695.1529109478, "step_time_sec": 8.231659009994473, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9848, "loss": 3.855855941772461, "lr": 1.1729124027996631e-05, "elapsed_sec": 81703.38396430016, "step_time_sec": 8.230911415012088, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9849, "loss": 3.984201669692993, "lr": 1.1706515375489426e-05, "elapsed_sec": 81711.61497306824, "step_time_sec": 8.230795232986566, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9850, "loss": 3.771528720855713, "lr": 1.1684054164057939e-05, "elapsed_sec": 81719.84577155113, "step_time_sec": 8.230645069037564, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9851, "loss": 3.976742744445801, "lr": 1.1661740429199848e-05, "elapsed_sec": 81728.0766851902, "step_time_sec": 8.23076609801501, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9852, "loss": 3.901597023010254, "lr": 1.1639574206179813e-05, "elapsed_sec": 81736.30795645714, "step_time_sec": 8.231130397005472, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9853, "loss": 3.9086718559265137, "lr": 1.161755553002933e-05, "elapsed_sec": 81744.53754782677, "step_time_sec": 8.229483470960986, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9854, "loss": 3.9031074047088623, "lr": 1.1595684435546743e-05, "elapsed_sec": 81752.76893305779, "step_time_sec": 8.231182846997399, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9855, "loss": 3.9949896335601807, "lr": 1.1573960957297139e-05, "elapsed_sec": 81760.99958777428, "step_time_sec": 8.230511169997044, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9856, "loss": 3.6712827682495117, "lr": 1.1552385129612296e-05, "elapsed_sec": 81769.23070287704, "step_time_sec": 8.23099051200552, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9857, "loss": 3.9910874366760254, "lr": 1.1530956986590694e-05, "elapsed_sec": 81777.46168088913, "step_time_sec": 8.230737345991656, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9858, "loss": 4.055098056793213, "lr": 1.1509676562097358e-05, "elapsed_sec": 81785.69162082672, "step_time_sec": 8.229781128000468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9859, "loss": 3.9463884830474854, "lr": 1.1488543889763884e-05, "elapsed_sec": 81793.92231726646, "step_time_sec": 8.230540085001849, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9860, "loss": 3.8981454372406006, "lr": 1.146755900298839e-05, "elapsed_sec": 81802.15300798416, "step_time_sec": 8.230524909042288, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9861, "loss": 3.8288941383361816, "lr": 1.144672193493536e-05, "elapsed_sec": 81810.38414263725, "step_time_sec": 8.230990218988154, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9862, "loss": 3.9061672687530518, "lr": 1.1426032718535738e-05, "elapsed_sec": 81818.61467242241, "step_time_sec": 8.23040887498064, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9863, "loss": 3.830016851425171, "lr": 1.1405491386486753e-05, "elapsed_sec": 81826.84578442574, "step_time_sec": 8.230950637022033, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9864, "loss": 4.01065731048584, "lr": 1.138509797125196e-05, "elapsed_sec": 81835.0780813694, "step_time_sec": 8.232162084023003, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9865, "loss": 3.885744333267212, "lr": 1.1364852505061109e-05, "elapsed_sec": 81843.30864787102, "step_time_sec": 8.230385951988865, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9866, "loss": 3.978891611099243, "lr": 1.1344755019910147e-05, "elapsed_sec": 81851.53915190697, "step_time_sec": 8.230314115993679, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9867, "loss": 3.9672629833221436, "lr": 1.1324805547561156e-05, "elapsed_sec": 81859.76935744286, "step_time_sec": 8.23001225903863, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9868, "loss": 4.006011486053467, "lr": 1.130500411954229e-05, "elapsed_sec": 81867.9984035492, "step_time_sec": 8.228967418021057, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9869, "loss": 3.923706293106079, "lr": 1.1285350767147744e-05, "elapsed_sec": 81876.23027181625, "step_time_sec": 8.23165152897127, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9870, "loss": 3.870771646499634, "lr": 1.1265845521437668e-05, "elapsed_sec": 81884.46069312096, "step_time_sec": 8.230299715010915, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9871, "loss": 3.866888999938965, "lr": 1.1246488413238176e-05, "elapsed_sec": 81892.69194984436, "step_time_sec": 8.231056869029999, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9872, "loss": 4.0137434005737305, "lr": 1.122727947314127e-05, "elapsed_sec": 81900.9214129448, "step_time_sec": 8.22937216097489, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9873, "loss": 3.789302110671997, "lr": 1.1208218731504736e-05, "elapsed_sec": 81909.15263652802, "step_time_sec": 8.230999930994585, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9874, "loss": 3.8080508708953857, "lr": 1.1189306218452217e-05, "elapsed_sec": 81917.38304662704, "step_time_sec": 8.230264956015162, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9875, "loss": 3.898402690887451, "lr": 1.1170541963873016e-05, "elapsed_sec": 81925.61342597008, "step_time_sec": 8.230256095004734, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9876, "loss": 3.8368473052978516, "lr": 1.1151925997422211e-05, "elapsed_sec": 81933.84389448166, "step_time_sec": 8.230261365999468, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9877, "loss": 3.97216796875, "lr": 1.1133458348520466e-05, "elapsed_sec": 81942.07411146164, "step_time_sec": 8.23005422798451, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9878, "loss": 3.7932288646698, "lr": 1.1115139046354059e-05, "elapsed_sec": 81950.30550408363, "step_time_sec": 8.231228179996833, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9879, "loss": 3.982997417449951, "lr": 1.1096968119874845e-05, "elapsed_sec": 81958.53592538834, "step_time_sec": 8.230343512026593, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9880, "loss": 3.9028940200805664, "lr": 1.107894559780016e-05, "elapsed_sec": 81966.76659297943, "step_time_sec": 8.230427350034006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9881, "loss": 3.8807425498962402, "lr": 1.106107150861279e-05, "elapsed_sec": 81974.99723386765, "step_time_sec": 8.23051674303133, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9882, "loss": 3.824925184249878, "lr": 1.1043345880560982e-05, "elapsed_sec": 81983.22831082344, "step_time_sec": 8.230894840031397, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9883, "loss": 4.046975135803223, "lr": 1.1025768741658304e-05, "elapsed_sec": 81991.45961904526, "step_time_sec": 8.231170610990375, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9884, "loss": 4.018844127655029, "lr": 1.1008340119683703e-05, "elapsed_sec": 81999.69026207924, "step_time_sec": 8.230489898007363, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9885, "loss": 4.129218578338623, "lr": 1.0991060042181349e-05, "elapsed_sec": 82007.921697855, "step_time_sec": 8.231239062966779, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9886, "loss": 3.8413658142089844, "lr": 1.097392853646073e-05, "elapsed_sec": 82016.15267276764, "step_time_sec": 8.230875393026508, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9887, "loss": 3.907125234603882, "lr": 1.0956945629596444e-05, "elapsed_sec": 82024.38275647163, "step_time_sec": 8.229924887011293, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9888, "loss": 3.7478373050689697, "lr": 1.0940111348428306e-05, "elapsed_sec": 82032.61232280731, "step_time_sec": 8.22936863801442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9889, "loss": 3.999967336654663, "lr": 1.092342571956123e-05, "elapsed_sec": 82040.84129166603, "step_time_sec": 8.22882496099919, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9890, "loss": 3.9760842323303223, "lr": 1.090688876936518e-05, "elapsed_sec": 82049.07035207748, "step_time_sec": 8.228865478013176, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9891, "loss": 3.930811882019043, "lr": 1.0890500523975174e-05, "elapsed_sec": 82057.29962849617, "step_time_sec": 8.229138172988314, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9892, "loss": 3.940798044204712, "lr": 1.0874261009291203e-05, "elapsed_sec": 82065.52988409996, "step_time_sec": 8.230156217003241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9893, "loss": 3.945483446121216, "lr": 1.0858170250978193e-05, "elapsed_sec": 82073.76062345505, "step_time_sec": 8.230548633961007, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9894, "loss": 3.815439462661743, "lr": 1.0842228274465986e-05, "elapsed_sec": 82081.99062418938, "step_time_sec": 8.229803198948503, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9895, "loss": 3.9457590579986572, "lr": 1.0826435104949308e-05, "elapsed_sec": 82090.22031927109, "step_time_sec": 8.22955708799418, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9896, "loss": 3.9392080307006836, "lr": 1.081079076738769e-05, "elapsed_sec": 82098.45018386841, "step_time_sec": 8.229702567972708, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9897, "loss": 3.908663034439087, "lr": 1.079529528650543e-05, "elapsed_sec": 82106.68149900436, "step_time_sec": 8.231167696008924, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9898, "loss": 3.9904749393463135, "lr": 1.0779948686791627e-05, "elapsed_sec": 82114.91301894188, "step_time_sec": 8.231365762010682, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9899, "loss": 3.947360038757324, "lr": 1.0764750992500031e-05, "elapsed_sec": 82123.14344787598, "step_time_sec": 8.230240051052533, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9900, "loss": 4.035055160522461, "lr": 1.0749702227649095e-05, "elapsed_sec": 82131.37499189377, "step_time_sec": 8.231407813029364, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9901, "loss": 3.9709486961364746, "lr": 1.0734802416021895e-05, "elapsed_sec": 82139.60568761826, "step_time_sec": 8.230567659949884, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9902, "loss": 3.9946088790893555, "lr": 1.0720051581166105e-05, "elapsed_sec": 82147.83601117134, "step_time_sec": 8.230122470995411, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9903, "loss": 3.9066131114959717, "lr": 1.0705449746393952e-05, "elapsed_sec": 82156.0658762455, "step_time_sec": 8.229773648956325, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9904, "loss": 4.0316925048828125, "lr": 1.069099693478218e-05, "elapsed_sec": 82164.29683995247, "step_time_sec": 8.230737639009021, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9905, "loss": 3.900740623474121, "lr": 1.067669316917202e-05, "elapsed_sec": 82172.52702307701, "step_time_sec": 8.230016325018369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9906, "loss": 3.9278831481933594, "lr": 1.0662538472169156e-05, "elapsed_sec": 82180.75709962845, "step_time_sec": 8.229948369029444, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9907, "loss": 3.953660488128662, "lr": 1.0648532866143676e-05, "elapsed_sec": 82188.98703098297, "step_time_sec": 8.229776848980691, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9908, "loss": 3.914501190185547, "lr": 1.0634676373230051e-05, "elapsed_sec": 82197.21582961082, "step_time_sec": 8.22858434804948, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9909, "loss": 3.9376001358032227, "lr": 1.0620969015327084e-05, "elapsed_sec": 82205.44510531425, "step_time_sec": 8.22912993101636, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9910, "loss": 4.0739898681640625, "lr": 1.0607410814097899e-05, "elapsed_sec": 82213.6743760109, "step_time_sec": 8.229140916024335, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9911, "loss": 3.897223711013794, "lr": 1.059400179096988e-05, "elapsed_sec": 82221.90535855293, "step_time_sec": 8.230779775010888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9912, "loss": 4.007583141326904, "lr": 1.0580741967134664e-05, "elapsed_sec": 82230.13501954079, "step_time_sec": 8.229517464991659, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9913, "loss": 3.7711408138275146, "lr": 1.0567631363548059e-05, "elapsed_sec": 82238.36615347862, "step_time_sec": 8.230970502016135, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9914, "loss": 3.9393999576568604, "lr": 1.0554670000930098e-05, "elapsed_sec": 82246.59644269943, "step_time_sec": 8.230194535048213, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9915, "loss": 4.049795150756836, "lr": 1.0541857899764914e-05, "elapsed_sec": 82254.8260819912, "step_time_sec": 8.229449987993576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9916, "loss": 3.9309239387512207, "lr": 1.052919508030075e-05, "elapsed_sec": 82263.05401277542, "step_time_sec": 8.22772965504555, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9917, "loss": 3.8669939041137695, "lr": 1.0516681562549958e-05, "elapsed_sec": 82271.28151035309, "step_time_sec": 8.227306675980799, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9918, "loss": 3.849242925643921, "lr": 1.0504317366288883e-05, "elapsed_sec": 82279.51285552979, "step_time_sec": 8.231188357982319, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9919, "loss": 4.091063022613525, "lr": 1.049210251105792e-05, "elapsed_sec": 82287.74363732338, "step_time_sec": 8.2306852779584, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9920, "loss": 3.980698585510254, "lr": 1.0480037016161428e-05, "elapsed_sec": 82295.9743463993, "step_time_sec": 8.230483573977835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9921, "loss": 3.918017625808716, "lr": 1.0468120900667733e-05, "elapsed_sec": 82304.2061328888, "step_time_sec": 8.23165751597844, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9922, "loss": 3.9209539890289307, "lr": 1.0456354183409052e-05, "elapsed_sec": 82312.43701028824, "step_time_sec": 8.230715544021223, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9923, "loss": 3.8602330684661865, "lr": 1.0444736882981529e-05, "elapsed_sec": 82320.66813325882, "step_time_sec": 8.23093833803432, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9924, "loss": 3.8846614360809326, "lr": 1.0433269017745143e-05, "elapsed_sec": 82328.89837121964, "step_time_sec": 8.230109440977685, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9925, "loss": 3.9036316871643066, "lr": 1.0421950605823727e-05, "elapsed_sec": 82337.12995266914, "step_time_sec": 8.23143040802097, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9926, "loss": 3.919706344604492, "lr": 1.0410781665104875e-05, "elapsed_sec": 82345.35993862152, "step_time_sec": 8.229783907008823, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9927, "loss": 3.7551138401031494, "lr": 1.0399762213240027e-05, "elapsed_sec": 82353.59030270576, "step_time_sec": 8.23021243995754, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9928, "loss": 3.8701183795928955, "lr": 1.0388892267644301e-05, "elapsed_sec": 82361.82070946693, "step_time_sec": 8.230237553012557, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9929, "loss": 3.924978733062744, "lr": 1.0378171845496578e-05, "elapsed_sec": 82370.05187630653, "step_time_sec": 8.231011816998944, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9930, "loss": 3.9698774814605713, "lr": 1.0367600963739422e-05, "elapsed_sec": 82378.28291130066, "step_time_sec": 8.230941945977975, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9931, "loss": 3.935230255126953, "lr": 1.035717963907905e-05, "elapsed_sec": 82386.51358485222, "step_time_sec": 8.230427892005537, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9932, "loss": 3.979585647583008, "lr": 1.0346907887985344e-05, "elapsed_sec": 82394.74437069893, "step_time_sec": 8.230684847047087, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9933, "loss": 3.886031150817871, "lr": 1.0336785726691766e-05, "elapsed_sec": 82402.97452282906, "step_time_sec": 8.22995260200696, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9934, "loss": 3.9487111568450928, "lr": 1.0326813171195394e-05, "elapsed_sec": 82411.20549154282, "step_time_sec": 8.230841254990082, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9935, "loss": 4.157890319824219, "lr": 1.0316990237256873e-05, "elapsed_sec": 82419.43638300896, "step_time_sec": 8.230694250960369, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9936, "loss": 3.9168694019317627, "lr": 1.030731694040035e-05, "elapsed_sec": 82427.6669576168, "step_time_sec": 8.230436828976963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9937, "loss": 3.8418428897857666, "lr": 1.0297793295913534e-05, "elapsed_sec": 82435.89847111702, "step_time_sec": 8.231313491996843, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9938, "loss": 3.9691355228424072, "lr": 1.0288419318847575e-05, "elapsed_sec": 82444.12958049774, "step_time_sec": 8.23098558402853, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9939, "loss": 3.926851987838745, "lr": 1.0279195024017126e-05, "elapsed_sec": 82452.36035561562, "step_time_sec": 8.230582387011964, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9940, "loss": 4.0467915534973145, "lr": 1.0270120426000264e-05, "elapsed_sec": 82460.59121394157, "step_time_sec": 8.230721361003816, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9941, "loss": 4.02380895614624, "lr": 1.026119553913849e-05, "elapsed_sec": 82468.82138586044, "step_time_sec": 8.230023743992206, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9942, "loss": 3.9445226192474365, "lr": 1.0252420377536694e-05, "elapsed_sec": 82477.04949402809, "step_time_sec": 8.227915282011963, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9943, "loss": 3.951421022415161, "lr": 1.024379495506315e-05, "elapsed_sec": 82485.27803659439, "step_time_sec": 8.228393504046835, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9944, "loss": 3.986184597015381, "lr": 1.0235319285349488e-05, "elapsed_sec": 82493.51067018509, "step_time_sec": 8.232463150983676, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9945, "loss": 4.033551216125488, "lr": 1.0226993381790648e-05, "elapsed_sec": 82501.74124670029, "step_time_sec": 8.230437954014633, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9946, "loss": 4.148901462554932, "lr": 1.0218817257544904e-05, "elapsed_sec": 82509.96953082085, "step_time_sec": 8.228172715986148, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9947, "loss": 4.105860710144043, "lr": 1.0210790925533799e-05, "elapsed_sec": 82518.19780349731, "step_time_sec": 8.228071797988378, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9948, "loss": 3.9176645278930664, "lr": 1.020291439844216e-05, "elapsed_sec": 82526.4257376194, "step_time_sec": 8.227788819989655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9949, "loss": 3.8262863159179688, "lr": 1.0195187688718049e-05, "elapsed_sec": 82534.65690803528, "step_time_sec": 8.230983400018886, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9950, "loss": 4.0130696296691895, "lr": 1.0187610808572769e-05, "elapsed_sec": 82542.88725757599, "step_time_sec": 8.23020093102241, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9951, "loss": 3.8501181602478027, "lr": 1.0180183769980811e-05, "elapsed_sec": 82551.11738610268, "step_time_sec": 8.230008970014751, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9952, "loss": 3.9096148014068604, "lr": 1.0172906584679886e-05, "elapsed_sec": 82559.3457019329, "step_time_sec": 8.22811698098667, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9953, "loss": 3.9829494953155518, "lr": 1.0165779264170868e-05, "elapsed_sec": 82567.57435369492, "step_time_sec": 8.228543180972338, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9954, "loss": 3.8739726543426514, "lr": 1.0158801819717752e-05, "elapsed_sec": 82575.80595326424, "step_time_sec": 8.231398426985834, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9955, "loss": 3.808727741241455, "lr": 1.015197426234772e-05, "elapsed_sec": 82584.03624033928, "step_time_sec": 8.23018851998495, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9956, "loss": 3.9082517623901367, "lr": 1.0145296602851031e-05, "elapsed_sec": 82592.33277010918, "step_time_sec": 8.237807905010413, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9957, "loss": 3.866055488586426, "lr": 1.0138768851781073e-05, "elapsed_sec": 82600.5630660057, "step_time_sec": 8.230122052016668, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9958, "loss": 4.069762706756592, "lr": 1.0132391019454306e-05, "elapsed_sec": 82608.79460477829, "step_time_sec": 8.231381272955332, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9959, "loss": 3.804232597351074, "lr": 1.0126163115950254e-05, "elapsed_sec": 82617.02501010895, "step_time_sec": 8.23026156600099, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9960, "loss": 3.9817936420440674, "lr": 1.01200851511115e-05, "elapsed_sec": 82625.25622582436, "step_time_sec": 8.231043843028601, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9961, "loss": 3.8671786785125732, "lr": 1.0114157134543673e-05, "elapsed_sec": 82633.48648810387, "step_time_sec": 8.23007556895027, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9962, "loss": 3.94728684425354, "lr": 1.01083790756154e-05, "elapsed_sec": 82641.7177875042, "step_time_sec": 8.23115362401586, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9963, "loss": 3.9832584857940674, "lr": 1.0102750983458341e-05, "elapsed_sec": 82649.94869971275, "step_time_sec": 8.230800146004185, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9964, "loss": 3.9973390102386475, "lr": 1.0097272866967118e-05, "elapsed_sec": 82658.17996263504, "step_time_sec": 8.231052076036576, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9965, "loss": 3.8176512718200684, "lr": 1.0091944734799374e-05, "elapsed_sec": 82666.41073513031, "step_time_sec": 8.230643754999619, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9966, "loss": 3.7768397331237793, "lr": 1.0086766595375664e-05, "elapsed_sec": 82674.6412396431, "step_time_sec": 8.230315701046493, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9967, "loss": 3.824215888977051, "lr": 1.008173845687955e-05, "elapsed_sec": 82682.87220835686, "step_time_sec": 8.230824391008355, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9968, "loss": 3.873689651489258, "lr": 1.0076860327257487e-05, "elapsed_sec": 82691.10232949257, "step_time_sec": 8.230002045980655, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9969, "loss": 3.9197006225585938, "lr": 1.0072132214218888e-05, "elapsed_sec": 82699.3329000473, "step_time_sec": 8.230418121966068, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9970, "loss": 4.029581069946289, "lr": 1.0067554125236051e-05, "elapsed_sec": 82707.56315922737, "step_time_sec": 8.230060945963487, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9971, "loss": 3.9366796016693115, "lr": 1.0063126067544194e-05, "elapsed_sec": 82715.79358458519, "step_time_sec": 8.230255652044434, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9972, "loss": 3.971559762954712, "lr": 1.0058848048141425e-05, "elapsed_sec": 82724.02145218849, "step_time_sec": 8.227729678968899, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9973, "loss": 3.779585599899292, "lr": 1.0054720073788732e-05, "elapsed_sec": 82732.25031757355, "step_time_sec": 8.228708943992388, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9974, "loss": 3.9590866565704346, "lr": 1.005074215100996e-05, "elapsed_sec": 82740.47993516922, "step_time_sec": 8.229463413998019, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9975, "loss": 3.873562812805176, "lr": 1.0046914286091828e-05, "elapsed_sec": 82748.71059632301, "step_time_sec": 8.23051092698006, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9976, "loss": 3.9756298065185547, "lr": 1.0043236485083884e-05, "elapsed_sec": 82756.94131469727, "step_time_sec": 8.230616959976032, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9977, "loss": 3.943371057510376, "lr": 1.0039708753798535e-05, "elapsed_sec": 82765.17036986351, "step_time_sec": 8.228904209041502, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9978, "loss": 3.816318988800049, "lr": 1.0036331097811015e-05, "elapsed_sec": 82773.39916992188, "step_time_sec": 8.228581538016442, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9979, "loss": 4.2118330001831055, "lr": 1.003310352245936e-05, "elapsed_sec": 82781.62893271446, "step_time_sec": 8.229605734988581, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9980, "loss": 3.8895840644836426, "lr": 1.003002603284445e-05, "elapsed_sec": 82789.86047673225, "step_time_sec": 8.231377214018721, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9981, "loss": 3.9262845516204834, "lr": 1.0027098633829934e-05, "elapsed_sec": 82798.09101247787, "step_time_sec": 8.230451298994012, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9982, "loss": 3.856168270111084, "lr": 1.0024321330042282e-05, "elapsed_sec": 82806.3185415268, "step_time_sec": 8.227318812976591, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9983, "loss": 4.141696453094482, "lr": 1.002169412587075e-05, "elapsed_sec": 82814.547549963, "step_time_sec": 8.22887032298604, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9984, "loss": 3.8475911617279053, "lr": 1.0019217025467364e-05, "elapsed_sec": 82822.77746796608, "step_time_sec": 8.22973084799014, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9985, "loss": 4.034787178039551, "lr": 1.0016890032746936e-05, "elapsed_sec": 82831.00789809227, "step_time_sec": 8.230290592007805, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9986, "loss": 3.8909647464752197, "lr": 1.0014713151387056e-05, "elapsed_sec": 82839.238011837, "step_time_sec": 8.22994392196415, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9987, "loss": 3.9801628589630127, "lr": 1.0012686384828071e-05, "elapsed_sec": 82847.4661514759, "step_time_sec": 8.227995021967217, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9988, "loss": 3.987400770187378, "lr": 1.0010809736273064e-05, "elapsed_sec": 82855.69529414177, "step_time_sec": 8.22896902897628, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9989, "loss": 3.911628246307373, "lr": 1.0009083208687907e-05, "elapsed_sec": 82863.92573690414, "step_time_sec": 8.230294973996934, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9990, "loss": 3.9149062633514404, "lr": 1.0007506804801204e-05, "elapsed_sec": 82872.15632128716, "step_time_sec": 8.230428243987262, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9991, "loss": 3.867527484893799, "lr": 1.0006080527104311e-05, "elapsed_sec": 82880.38768219948, "step_time_sec": 8.231230848992709, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9992, "loss": 3.947442054748535, "lr": 1.0004804377851291e-05, "elapsed_sec": 82888.61754083633, "step_time_sec": 8.229703383985907, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9993, "loss": 3.7268905639648438, "lr": 1.0003678359059001e-05, "elapsed_sec": 82896.84808325768, "step_time_sec": 8.230337830958888, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9994, "loss": 3.9420931339263916, "lr": 1.0002702472506991e-05, "elapsed_sec": 82905.07922458649, "step_time_sec": 8.231046197004616, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9995, "loss": 3.918195962905884, "lr": 1.0001876719737554e-05, "elapsed_sec": 82913.31020307541, "step_time_sec": 8.230766332009807, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9996, "loss": 3.8476171493530273, "lr": 1.0001201102055698e-05, "elapsed_sec": 82921.5417125225, "step_time_sec": 8.231361240032129, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9997, "loss": 3.9674370288848877, "lr": 1.0000675620529189e-05, "elapsed_sec": 82929.77315545082, "step_time_sec": 8.231285793008283, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9998, "loss": 3.884418249130249, "lr": 1.0000300275988479e-05, "elapsed_sec": 82938.00506711006, "step_time_sec": 8.231760244001634, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 9999, "loss": 4.05544376373291, "lr": 1.000007506902678e-05, "elapsed_sec": 82946.2359495163, "step_time_sec": 8.230723729997408, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": false, "validation_ran": false, "probe_ran": false, "probe_elapsed_sec": 0.0, "warmup_excluded": false, "peak_vram_bytes": 13270111744}
{"step": 10000, "loss": 4.080025672912598, "lr": 1e-05, "elapsed_sec": 82954.4666686058, "step_time_sec": 52.62291231501149, "effective_batch_tokens": 239904, "sequence_length": 2499, "batch_size": 6, "grad_accum_steps": 16, "checkpoint_saved": true, "validation_ran": true, "probe_ran": true, "probe_elapsed_sec": 0.8435643169796094, "warmup_excluded": false, "peak_vram_bytes": 13270111744}