diff --git "a/checkpoints/checkpoint-170/trainer_state.json" "b/checkpoints/checkpoint-170/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-170/trainer_state.json" @@ -0,0 +1,3434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.34, + "eval_steps": 500, + "global_step": 170, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002, + "gl_detached_lm_loss": 0.4408648908138275, + "gl_distil_loss": 0.0004234774096403271, + "gl_dms_closed_frac": 0.002307269489392638, + "gl_dms_cr": 1.0023125410079956, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.0, + "gl_dms_target_frac": 0.0, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0004234774096403271, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 0.08518081903457642, + "learning_rate": 3e-05, + "loss": 0.0004, + "step": 1 + }, + { + "epoch": 0.004, + "gl_detached_lm_loss": 0.47212445735931396, + "gl_distil_loss": 0.0008152268128469586, + "gl_dms_closed_frac": 0.0022805265616625547, + "gl_dms_cr": 1.0022858381271362, + "gl_dms_loss": 0.026290902867913246, + "gl_dms_target_cr": 1.029411792755127, + "gl_dms_target_frac": 0.02857142873108387, + "gl_eos_tokens": 12.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.02710612863302231, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.5, + "gl_positions_without_loss_calculation": 12.5, + "grad_norm": 2.49080753326416, + "learning_rate": 3e-05, + "loss": 0.0271, + "step": 2 + }, + { + "epoch": 0.006, + "gl_detached_lm_loss": 0.4552858769893646, + "gl_distil_loss": 0.0005731006385758519, + "gl_dms_closed_frac": 0.011005931533873081, + "gl_dms_cr": 1.0111286640167236, + "gl_dms_loss": 0.04454962536692619, + "gl_dms_target_cr": 1.058823585510254, + "gl_dms_target_frac": 0.0555555559694767, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.045122724026441574, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 5.462989330291748, + "learning_rate": 3e-05, + "loss": 0.0451, + "step": 3 + }, + { + "epoch": 0.008, + "gl_detached_lm_loss": 0.4574431777000427, + "gl_distil_loss": 0.0012222880031913519, + "gl_dms_closed_frac": 0.04424896836280823, + "gl_dms_cr": 1.0463114976882935, + "gl_dms_loss": 0.036832112818956375, + "gl_dms_target_cr": 1.0882352590560913, + "gl_dms_target_frac": 0.0810810849070549, + "gl_eos_tokens": 9.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.03805439546704292, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.5, + "gl_positions_without_loss_calculation": 9.5, + "grad_norm": 15.773651123046875, + "learning_rate": 3e-05, + "loss": 0.0381, + "step": 4 + }, + { + "epoch": 0.01, + "gl_detached_lm_loss": 0.48430225253105164, + "gl_distil_loss": 0.0016105370596051216, + "gl_dms_closed_frac": 0.1391979455947876, + "gl_dms_cr": 1.1620138883590698, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.1176470518112183, + "gl_dms_target_frac": 0.10526316612958908, + "gl_eos_tokens": 13.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0016105370596051216, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.75, + "gl_positions_without_loss_calculation": 13.25, + "grad_norm": 1.219625473022461, + "learning_rate": 3e-05, + "loss": 0.0016, + "step": 5 + }, + { + "epoch": 0.012, + "gl_detached_lm_loss": 0.45663878321647644, + "gl_distil_loss": 0.003948591183871031, + "gl_dms_closed_frac": 0.2567480504512787, + "gl_dms_cr": 1.3462255001068115, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.1470588445663452, + "gl_dms_target_frac": 0.12820513546466827, + "gl_eos_tokens": 13.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003948591183871031, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.25, + "gl_positions_without_loss_calculation": 13.75, + "grad_norm": 6.573792457580566, + "learning_rate": 3e-05, + "loss": 0.0039, + "step": 6 + }, + { + "epoch": 0.014, + "gl_detached_lm_loss": 0.4690469801425934, + "gl_distil_loss": 0.005540270358324051, + "gl_dms_closed_frac": 0.34524545073509216, + "gl_dms_cr": 1.5293982028961182, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.1764706373214722, + "gl_dms_target_frac": 0.14999999105930328, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005540270358324051, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 4.326766014099121, + "learning_rate": 3e-05, + "loss": 0.0055, + "step": 7 + }, + { + "epoch": 0.016, + "gl_detached_lm_loss": 0.4756607413291931, + "gl_distil_loss": 0.00488277105614543, + "gl_dms_closed_frac": 0.41206902265548706, + "gl_dms_cr": 1.7022228240966797, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.20588219165802, + "gl_dms_target_frac": 0.17073169350624084, + "gl_eos_tokens": 11.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00488277105614543, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.375, + "gl_positions_without_loss_calculation": 11.625, + "grad_norm": 2.491706132888794, + "learning_rate": 3e-05, + "loss": 0.0049, + "step": 8 + }, + { + "epoch": 0.018, + "gl_detached_lm_loss": 0.47128596901893616, + "gl_distil_loss": 0.004345947876572609, + "gl_dms_closed_frac": 0.43498629331588745, + "gl_dms_cr": 1.7745075225830078, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.235294222831726, + "gl_dms_target_frac": 0.190476194024086, + "gl_eos_tokens": 10.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004345947876572609, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.375, + "gl_positions_without_loss_calculation": 10.625, + "grad_norm": 4.466060161590576, + "learning_rate": 3e-05, + "loss": 0.0043, + "step": 9 + }, + { + "epoch": 0.02, + "gl_detached_lm_loss": 0.47395652532577515, + "gl_distil_loss": 0.005074856802821159, + "gl_dms_closed_frac": 0.43777039647102356, + "gl_dms_cr": 1.780246376991272, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.264705777168274, + "gl_dms_target_frac": 0.20930232107639313, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005074856802821159, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 7.389776229858398, + "learning_rate": 3e-05, + "loss": 0.0051, + "step": 10 + }, + { + "epoch": 0.022, + "gl_detached_lm_loss": 0.4599134922027588, + "gl_distil_loss": 0.004142467863857746, + "gl_dms_closed_frac": 0.4546947777271271, + "gl_dms_cr": 1.8353639841079712, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.29411780834198, + "gl_dms_target_frac": 0.22727273404598236, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004142467863857746, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 5.261406421661377, + "learning_rate": 3e-05, + "loss": 0.0041, + "step": 11 + }, + { + "epoch": 0.024, + "gl_detached_lm_loss": 0.47876226902008057, + "gl_distil_loss": 0.0035050029400736094, + "gl_dms_closed_frac": 0.43767115473747253, + "gl_dms_cr": 1.779110312461853, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.3235293626785278, + "gl_dms_target_frac": 0.24444445967674255, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0035050029400736094, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.726235032081604, + "learning_rate": 3e-05, + "loss": 0.0035, + "step": 12 + }, + { + "epoch": 0.026, + "gl_detached_lm_loss": 0.47314175963401794, + "gl_distil_loss": 0.00313362805172801, + "gl_dms_closed_frac": 0.42793411016464233, + "gl_dms_cr": 1.7503966093063354, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.3529412746429443, + "gl_dms_target_frac": 0.2608695328235626, + "gl_eos_tokens": 9.125, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00313362805172801, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.875, + "gl_positions_without_loss_calculation": 9.125, + "grad_norm": 1.3173593282699585, + "learning_rate": 3e-05, + "loss": 0.0031, + "step": 13 + }, + { + "epoch": 0.028, + "gl_detached_lm_loss": 0.46165570616722107, + "gl_distil_loss": 0.0026301750913262367, + "gl_dms_closed_frac": 0.4108446538448334, + "gl_dms_cr": 1.6978795528411865, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.3823529481887817, + "gl_dms_target_frac": 0.27659574151039124, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026301750913262367, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 2.1748335361480713, + "learning_rate": 3e-05, + "loss": 0.0026, + "step": 14 + }, + { + "epoch": 0.03, + "gl_detached_lm_loss": 0.47738948464393616, + "gl_distil_loss": 0.00245979567989707, + "gl_dms_closed_frac": 0.3945341110229492, + "gl_dms_cr": 1.6518938541412354, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.4117648601531982, + "gl_dms_target_frac": 0.2916666567325592, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00245979567989707, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.0410534143447876, + "learning_rate": 3e-05, + "loss": 0.0025, + "step": 15 + }, + { + "epoch": 0.032, + "gl_detached_lm_loss": 0.4363529086112976, + "gl_distil_loss": 0.002206941368058324, + "gl_dms_closed_frac": 0.3827410936355591, + "gl_dms_cr": 1.6204078197479248, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.441176414489746, + "gl_dms_target_frac": 0.30612245202064514, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002206941368058324, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 1.933333396911621, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 16 + }, + { + "epoch": 0.034, + "gl_detached_lm_loss": 0.4372034966945648, + "gl_distil_loss": 0.0019222048576921225, + "gl_dms_closed_frac": 0.3600853383541107, + "gl_dms_cr": 1.563251256942749, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.470588207244873, + "gl_dms_target_frac": 0.31999996304512024, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0019222048576921225, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 1.3154582977294922, + "learning_rate": 3e-05, + "loss": 0.0019, + "step": 17 + }, + { + "epoch": 0.036, + "gl_detached_lm_loss": 0.4272920787334442, + "gl_distil_loss": 0.0017361550126224756, + "gl_dms_closed_frac": 0.3457123935222626, + "gl_dms_cr": 1.5294559001922607, + "gl_dms_loss": 0.003807865083217621, + "gl_dms_target_cr": 1.5, + "gl_dms_target_frac": 0.3333333134651184, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00554402032867074, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 3.785482168197632, + "learning_rate": 3e-05, + "loss": 0.0055, + "step": 18 + }, + { + "epoch": 0.038, + "gl_detached_lm_loss": 0.4550657570362091, + "gl_distil_loss": 0.0017214431427419186, + "gl_dms_closed_frac": 0.3477359712123871, + "gl_dms_cr": 1.533597707748413, + "gl_dms_loss": 0.0037946999073028564, + "gl_dms_target_cr": 1.529411792755127, + "gl_dms_target_frac": 0.3461538553237915, + "gl_eos_tokens": 12.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005516142584383488, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.375, + "gl_positions_without_loss_calculation": 12.625, + "grad_norm": 7.753348350524902, + "learning_rate": 3e-05, + "loss": 0.0055, + "step": 19 + }, + { + "epoch": 0.04, + "gl_detached_lm_loss": 0.4646049737930298, + "gl_distil_loss": 0.0019962999504059553, + "gl_dms_closed_frac": 0.36188656091690063, + "gl_dms_cr": 1.5680888891220093, + "gl_dms_loss": 0.0048120878636837006, + "gl_dms_target_cr": 1.558823585510254, + "gl_dms_target_frac": 0.358490526676178, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0068083880469202995, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 15.435755729675293, + "learning_rate": 3e-05, + "loss": 0.0068, + "step": 20 + }, + { + "epoch": 0.042, + "gl_detached_lm_loss": 0.45961058139801025, + "gl_distil_loss": 0.001984530594199896, + "gl_dms_closed_frac": 0.395085871219635, + "gl_dms_cr": 1.653983473777771, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.5882351398468018, + "gl_dms_target_frac": 0.37037035822868347, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.001984530594199896, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 0.7058410048484802, + "learning_rate": 3e-05, + "loss": 0.002, + "step": 21 + }, + { + "epoch": 0.044, + "gl_detached_lm_loss": 0.4503985643386841, + "gl_distil_loss": 0.002159471856430173, + "gl_dms_closed_frac": 0.4215147793292999, + "gl_dms_cr": 1.7290366888046265, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.6176470518112183, + "gl_dms_target_frac": 0.38181817531585693, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002159471856430173, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 1.5459787845611572, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 22 + }, + { + "epoch": 0.046, + "gl_detached_lm_loss": 0.5068718194961548, + "gl_distil_loss": 0.0023190875072032213, + "gl_dms_closed_frac": 0.4291229844093323, + "gl_dms_cr": 1.7522425651550293, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.6470587253570557, + "gl_dms_target_frac": 0.3928571045398712, + "gl_eos_tokens": 12.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0023190875072032213, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.5, + "gl_positions_without_loss_calculation": 12.5, + "grad_norm": 1.3756858110427856, + "learning_rate": 3e-05, + "loss": 0.0023, + "step": 23 + }, + { + "epoch": 0.048, + "gl_detached_lm_loss": 0.4678501486778259, + "gl_distil_loss": 0.0021665464155375957, + "gl_dms_closed_frac": 0.44796285033226013, + "gl_dms_cr": 1.8125988245010376, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.6764706373214722, + "gl_dms_target_frac": 0.403508722782135, + "gl_eos_tokens": 11.375, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0021665464155375957, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.625, + "gl_positions_without_loss_calculation": 11.375, + "grad_norm": 0.5030384063720703, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 24 + }, + { + "epoch": 0.05, + "gl_detached_lm_loss": 0.46251434087753296, + "gl_distil_loss": 0.0021796354558318853, + "gl_dms_closed_frac": 0.4480258822441101, + "gl_dms_cr": 1.8118644952774048, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.70588219165802, + "gl_dms_target_frac": 0.4137931168079376, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0021796354558318853, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 0.4940311908721924, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 25 + }, + { + "epoch": 0.052, + "gl_detached_lm_loss": 0.5336480736732483, + "gl_distil_loss": 0.0026741931214928627, + "gl_dms_closed_frac": 0.4484004080295563, + "gl_dms_cr": 1.8130683898925781, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.735294222831726, + "gl_dms_target_frac": 0.423728883266449, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026741931214928627, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.8604220151901245, + "learning_rate": 3e-05, + "loss": 0.0027, + "step": 26 + }, + { + "epoch": 0.054, + "gl_detached_lm_loss": 0.49125808477401733, + "gl_distil_loss": 0.0021746917627751827, + "gl_dms_closed_frac": 0.45190396904945374, + "gl_dms_cr": 1.825452208518982, + "gl_dms_loss": 0.0002978183329105377, + "gl_dms_target_cr": 1.764705777168274, + "gl_dms_target_frac": 0.4333333671092987, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0024725100956857204, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 3.132777452468872, + "learning_rate": 3e-05, + "loss": 0.0025, + "step": 27 + }, + { + "epoch": 0.056, + "gl_detached_lm_loss": 0.44300004839897156, + "gl_distil_loss": 0.002110114088281989, + "gl_dms_closed_frac": 0.4698338508605957, + "gl_dms_cr": 1.8869034051895142, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.79411780834198, + "gl_dms_target_frac": 0.44262292981147766, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002110114088281989, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 0.48894694447517395, + "learning_rate": 3e-05, + "loss": 0.0021, + "step": 28 + }, + { + "epoch": 0.058, + "gl_detached_lm_loss": 0.4883745610713959, + "gl_distil_loss": 0.0021808757446706295, + "gl_dms_closed_frac": 0.4731365442276001, + "gl_dms_cr": 1.8983519077301025, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.8235293626785278, + "gl_dms_target_frac": 0.4516129195690155, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0021808757446706295, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 0.7062388062477112, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 29 + }, + { + "epoch": 0.06, + "gl_detached_lm_loss": 0.46343907713890076, + "gl_distil_loss": 0.002232843078672886, + "gl_dms_closed_frac": 0.48099470138549805, + "gl_dms_cr": 1.92780601978302, + "gl_dms_loss": 0.00032526254653930664, + "gl_dms_target_cr": 1.8529412746429443, + "gl_dms_target_frac": 0.4603174328804016, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0025581056252121925, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 3.083817958831787, + "learning_rate": 3e-05, + "loss": 0.0026, + "step": 30 + }, + { + "epoch": 0.062, + "gl_detached_lm_loss": 0.4522785544395447, + "gl_distil_loss": 0.0021007806062698364, + "gl_dms_closed_frac": 0.4981619715690613, + "gl_dms_cr": 1.9931349754333496, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.8823529481887817, + "gl_dms_target_frac": 0.46875, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0021007806062698364, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 0.6077609062194824, + "learning_rate": 3e-05, + "loss": 0.0021, + "step": 31 + }, + { + "epoch": 0.064, + "gl_detached_lm_loss": 0.4637286067008972, + "gl_distil_loss": 0.002412236761301756, + "gl_dms_closed_frac": 0.5112192034721375, + "gl_dms_cr": 2.046431064605713, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.9117648601531982, + "gl_dms_target_frac": 0.47692304849624634, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002412236761301756, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 0.4839731752872467, + "learning_rate": 3e-05, + "loss": 0.0024, + "step": 32 + }, + { + "epoch": 0.066, + "gl_detached_lm_loss": 0.48589426279067993, + "gl_distil_loss": 0.0024046343751251698, + "gl_dms_closed_frac": 0.5115869045257568, + "gl_dms_cr": 2.0482912063598633, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.941176414489746, + "gl_dms_target_frac": 0.4848484992980957, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0024046343751251698, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 0.4125555157661438, + "learning_rate": 3e-05, + "loss": 0.0024, + "step": 33 + }, + { + "epoch": 0.068, + "gl_detached_lm_loss": 0.4837048351764679, + "gl_distil_loss": 0.0023821976501494646, + "gl_dms_closed_frac": 0.5104748606681824, + "gl_dms_cr": 2.0433568954467773, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 1.9705884456634521, + "gl_dms_target_frac": 0.4925372898578644, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0023821976501494646, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 0.4393292963504791, + "learning_rate": 3e-05, + "loss": 0.0024, + "step": 34 + }, + { + "epoch": 0.07, + "gl_detached_lm_loss": 0.45272281765937805, + "gl_distil_loss": 0.002244414296001196, + "gl_dms_closed_frac": 0.5154764652252197, + "gl_dms_cr": 2.0640711784362793, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.0, + "gl_dms_target_frac": 0.5, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002244414296001196, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 0.6267670392990112, + "learning_rate": 3e-05, + "loss": 0.0022, + "step": 35 + }, + { + "epoch": 0.072, + "gl_detached_lm_loss": 0.4794645309448242, + "gl_distil_loss": 0.0021888441406190395, + "gl_dms_closed_frac": 0.5047264099121094, + "gl_dms_cr": 2.0192503929138184, + "gl_dms_loss": 0.00308087095618248, + "gl_dms_target_cr": 2.029411554336548, + "gl_dms_target_frac": 0.5072464346885681, + "gl_eos_tokens": 9.875, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005269715096801519, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.125, + "gl_positions_without_loss_calculation": 9.875, + "grad_norm": 17.163253784179688, + "learning_rate": 3e-05, + "loss": 0.0053, + "step": 36 + }, + { + "epoch": 0.074, + "gl_detached_lm_loss": 0.4689008891582489, + "gl_distil_loss": 0.0023856312036514282, + "gl_dms_closed_frac": 0.5187268257141113, + "gl_dms_cr": 2.077929973602295, + "gl_dms_loss": 0.0002900660037994385, + "gl_dms_target_cr": 2.058823585510254, + "gl_dms_target_frac": 0.5142857432365417, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026756974402815104, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 2.661439895629883, + "learning_rate": 3e-05, + "loss": 0.0027, + "step": 37 + }, + { + "epoch": 0.076, + "gl_detached_lm_loss": 0.44916558265686035, + "gl_distil_loss": 0.0023738506715744734, + "gl_dms_closed_frac": 0.538723349571228, + "gl_dms_cr": 2.168072462081909, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.08823561668396, + "gl_dms_target_frac": 0.5211267471313477, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0023738506715744734, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 0.7621412873268127, + "learning_rate": 3e-05, + "loss": 0.0024, + "step": 38 + }, + { + "epoch": 0.078, + "gl_detached_lm_loss": 0.464703232049942, + "gl_distil_loss": 0.0026064212433993816, + "gl_dms_closed_frac": 0.5474792122840881, + "gl_dms_cr": 2.210324764251709, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.117647171020508, + "gl_dms_target_frac": 0.5277777314186096, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026064212433993816, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 1.6320092678070068, + "learning_rate": 3e-05, + "loss": 0.0026, + "step": 39 + }, + { + "epoch": 0.08, + "gl_detached_lm_loss": 0.4954476058483124, + "gl_distil_loss": 0.002672001253813505, + "gl_dms_closed_frac": 0.5522975921630859, + "gl_dms_cr": 2.233839273452759, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.1470587253570557, + "gl_dms_target_frac": 0.5342465043067932, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002672001253813505, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 0.5290228128433228, + "learning_rate": 3e-05, + "loss": 0.0027, + "step": 40 + }, + { + "epoch": 0.082, + "gl_detached_lm_loss": 0.46069520711898804, + "gl_distil_loss": 0.0026594277005642653, + "gl_dms_closed_frac": 0.5577877759933472, + "gl_dms_cr": 2.261887788772583, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.1764705181121826, + "gl_dms_target_frac": 0.5405405163764954, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026594277005642653, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 0.593712329864502, + "learning_rate": 3e-05, + "loss": 0.0027, + "step": 41 + }, + { + "epoch": 0.084, + "gl_detached_lm_loss": 0.4498485028743744, + "gl_distil_loss": 0.0026302323676645756, + "gl_dms_closed_frac": 0.5595740675926208, + "gl_dms_cr": 2.2711756229400635, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.2058823108673096, + "gl_dms_target_frac": 0.54666668176651, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026302323676645756, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 1.180981159210205, + "learning_rate": 3e-05, + "loss": 0.0026, + "step": 42 + }, + { + "epoch": 0.086, + "gl_detached_lm_loss": 0.44525861740112305, + "gl_distil_loss": 0.0024244743399322033, + "gl_dms_closed_frac": 0.5586432814598083, + "gl_dms_cr": 2.266235828399658, + "gl_dms_loss": 0.0005865171551704407, + "gl_dms_target_cr": 2.2352941036224365, + "gl_dms_target_frac": 0.5526315569877625, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003010991495102644, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 2.4405698776245117, + "learning_rate": 3e-05, + "loss": 0.003, + "step": 43 + }, + { + "epoch": 0.088, + "gl_detached_lm_loss": 0.45686984062194824, + "gl_distil_loss": 0.002607216825708747, + "gl_dms_closed_frac": 0.5671321749687195, + "gl_dms_cr": 2.3103342056274414, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.2647058963775635, + "gl_dms_target_frac": 0.5584415793418884, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002607216825708747, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 1.1561832427978516, + "learning_rate": 3e-05, + "loss": 0.0026, + "step": 44 + }, + { + "epoch": 0.09, + "gl_detached_lm_loss": 0.47623857855796814, + "gl_distil_loss": 0.0026032254099845886, + "gl_dms_closed_frac": 0.5701911449432373, + "gl_dms_cr": 2.326831102371216, + "gl_dms_loss": 8.52048397064209e-05, + "gl_dms_target_cr": 2.2941176891326904, + "gl_dms_target_frac": 0.5641025900840759, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0026884302496910095, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 2.5896213054656982, + "learning_rate": 3e-05, + "loss": 0.0027, + "step": 45 + }, + { + "epoch": 0.092, + "gl_detached_lm_loss": 0.4935804307460785, + "gl_distil_loss": 0.002987999701872468, + "gl_dms_closed_frac": 0.581356406211853, + "gl_dms_cr": 2.389448881149292, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.3235294818878174, + "gl_dms_target_frac": 0.5696201920509338, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002987999701872468, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 0.5520856380462646, + "learning_rate": 3e-05, + "loss": 0.003, + "step": 46 + }, + { + "epoch": 0.094, + "gl_detached_lm_loss": 0.47648417949676514, + "gl_distil_loss": 0.0030070601496845484, + "gl_dms_closed_frac": 0.5908368229866028, + "gl_dms_cr": 2.444538116455078, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.3529412746429443, + "gl_dms_target_frac": 0.574999988079071, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0030070601496845484, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 0.5767875909805298, + "learning_rate": 3e-05, + "loss": 0.003, + "step": 47 + }, + { + "epoch": 0.096, + "gl_detached_lm_loss": 0.4622802734375, + "gl_distil_loss": 0.002870834432542324, + "gl_dms_closed_frac": 0.5926159620285034, + "gl_dms_cr": 2.454831600189209, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.382352828979492, + "gl_dms_target_frac": 0.5802469253540039, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.002870834432542324, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 1.0462926626205444, + "learning_rate": 3e-05, + "loss": 0.0029, + "step": 48 + }, + { + "epoch": 0.098, + "gl_detached_lm_loss": 0.4593689739704132, + "gl_distil_loss": 0.0029501053504645824, + "gl_dms_closed_frac": 0.5951315760612488, + "gl_dms_cr": 2.4700350761413574, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.41176438331604, + "gl_dms_target_frac": 0.5853658318519592, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0029501053504645824, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 0.8309232592582703, + "learning_rate": 3e-05, + "loss": 0.003, + "step": 49 + }, + { + "epoch": 0.1, + "gl_detached_lm_loss": 0.46503978967666626, + "gl_distil_loss": 0.002882842207327485, + "gl_dms_closed_frac": 0.5900493860244751, + "gl_dms_cr": 2.4397754669189453, + "gl_dms_loss": 0.0024773404002189636, + "gl_dms_target_cr": 2.441176414489746, + "gl_dms_target_frac": 0.5903614163398743, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005360182840377092, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 9.821179389953613, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 50 + }, + { + "epoch": 0.102, + "gl_detached_lm_loss": 0.47805801033973694, + "gl_distil_loss": 0.0031700131949037313, + "gl_dms_closed_frac": 0.5995256304740906, + "gl_dms_cr": 2.4975829124450684, + "gl_dms_loss": 0.0009735524654388428, + "gl_dms_target_cr": 2.470588445663452, + "gl_dms_target_frac": 0.5952381491661072, + "gl_eos_tokens": 9.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004143565893173218, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32759.0, + "gl_positions_without_loss_calculation": 9.0, + "grad_norm": 5.710433006286621, + "learning_rate": 3e-05, + "loss": 0.0041, + "step": 51 + }, + { + "epoch": 0.104, + "gl_detached_lm_loss": 0.4751797914505005, + "gl_distil_loss": 0.0030524739995598793, + "gl_dms_closed_frac": 0.612511157989502, + "gl_dms_cr": 2.581015110015869, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.5, + "gl_dms_target_frac": 0.5999999642372131, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0030524739995598793, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 0.5378996133804321, + "learning_rate": 3e-05, + "loss": 0.0031, + "step": 52 + }, + { + "epoch": 0.106, + "gl_detached_lm_loss": 0.47998565435409546, + "gl_distil_loss": 0.0036123180761933327, + "gl_dms_closed_frac": 0.6233031749725342, + "gl_dms_cr": 2.655303955078125, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.529411554336548, + "gl_dms_target_frac": 0.604651153087616, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0036123180761933327, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 0.7021176218986511, + "learning_rate": 3e-05, + "loss": 0.0036, + "step": 53 + }, + { + "epoch": 0.108, + "gl_detached_lm_loss": 0.4588249623775482, + "gl_distil_loss": 0.0031498277094215155, + "gl_dms_closed_frac": 0.6244292855262756, + "gl_dms_cr": 2.6633052825927734, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.558823585510254, + "gl_dms_target_frac": 0.6091954112052917, + "gl_eos_tokens": 13.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0031498277094215155, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.0, + "gl_positions_without_loss_calculation": 13.0, + "grad_norm": 0.5357586741447449, + "learning_rate": 3e-05, + "loss": 0.0031, + "step": 54 + }, + { + "epoch": 0.11, + "gl_detached_lm_loss": 0.45006063580513, + "gl_distil_loss": 0.0030964650213718414, + "gl_dms_closed_frac": 0.6261225342750549, + "gl_dms_cr": 2.6749215126037598, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.58823561668396, + "gl_dms_target_frac": 0.6136364340782166, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0030964650213718414, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 0.5736832022666931, + "learning_rate": 3e-05, + "loss": 0.0031, + "step": 55 + }, + { + "epoch": 0.112, + "gl_detached_lm_loss": 0.4804791212081909, + "gl_distil_loss": 0.003160958644002676, + "gl_dms_closed_frac": 0.6197777390480042, + "gl_dms_cr": 2.6310806274414062, + "gl_dms_loss": 0.002230919897556305, + "gl_dms_target_cr": 2.617647171020508, + "gl_dms_target_frac": 0.6179775595664978, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005391878541558981, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 9.701092720031738, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 56 + }, + { + "epoch": 0.114, + "gl_detached_lm_loss": 0.48314258456230164, + "gl_distil_loss": 0.003439662978053093, + "gl_dms_closed_frac": 0.6313677430152893, + "gl_dms_cr": 2.7136151790618896, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.6470587253570557, + "gl_dms_target_frac": 0.6222222447395325, + "gl_eos_tokens": 10.875, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003439662978053093, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.125, + "gl_positions_without_loss_calculation": 10.875, + "grad_norm": 0.7246235013008118, + "learning_rate": 3e-05, + "loss": 0.0034, + "step": 57 + }, + { + "epoch": 0.116, + "gl_detached_lm_loss": 0.44782233238220215, + "gl_distil_loss": 0.0031148388516157866, + "gl_dms_closed_frac": 0.6334483623504639, + "gl_dms_cr": 2.7288801670074463, + "gl_dms_loss": 0.000607810914516449, + "gl_dms_target_cr": 2.6764702796936035, + "gl_dms_target_frac": 0.6263737082481384, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0037226497661322355, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 2.5122969150543213, + "learning_rate": 3e-05, + "loss": 0.0037, + "step": 58 + }, + { + "epoch": 0.118, + "gl_detached_lm_loss": 0.4750285744667053, + "gl_distil_loss": 0.003389698453247547, + "gl_dms_closed_frac": 0.6386871337890625, + "gl_dms_cr": 2.7682950496673584, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.7058825492858887, + "gl_dms_target_frac": 0.6304348111152649, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003389698453247547, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 0.5368403792381287, + "learning_rate": 3e-05, + "loss": 0.0034, + "step": 59 + }, + { + "epoch": 0.12, + "gl_detached_lm_loss": 0.46681150794029236, + "gl_distil_loss": 0.0033188825473189354, + "gl_dms_closed_frac": 0.6449574828147888, + "gl_dms_cr": 2.817054033279419, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.7352941036224365, + "gl_dms_target_frac": 0.6344085335731506, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0033188825473189354, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 0.4873170554637909, + "learning_rate": 3e-05, + "loss": 0.0033, + "step": 60 + }, + { + "epoch": 0.122, + "gl_detached_lm_loss": 0.45119157433509827, + "gl_distil_loss": 0.0035455345641821623, + "gl_dms_closed_frac": 0.6473181843757629, + "gl_dms_cr": 2.8355660438537598, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.7647058963775635, + "gl_dms_target_frac": 0.6382978558540344, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0035455345641821623, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 0.7403936982154846, + "learning_rate": 3e-05, + "loss": 0.0035, + "step": 61 + }, + { + "epoch": 0.124, + "gl_detached_lm_loss": 0.47377100586891174, + "gl_distil_loss": 0.003390070516616106, + "gl_dms_closed_frac": 0.6446278691291809, + "gl_dms_cr": 2.814770221710205, + "gl_dms_loss": 0.0015726163983345032, + "gl_dms_target_cr": 2.7941174507141113, + "gl_dms_target_frac": 0.6421052813529968, + "gl_eos_tokens": 9.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004962686914950609, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.5, + "gl_positions_without_loss_calculation": 9.5, + "grad_norm": 4.738519191741943, + "learning_rate": 3e-05, + "loss": 0.005, + "step": 62 + }, + { + "epoch": 0.126, + "gl_detached_lm_loss": 0.480397492647171, + "gl_distil_loss": 0.0035927374847233295, + "gl_dms_closed_frac": 0.650998592376709, + "gl_dms_cr": 2.86590838432312, + "gl_dms_loss": 0.0002143457531929016, + "gl_dms_target_cr": 2.8235297203063965, + "gl_dms_target_frac": 0.6458333134651184, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003807083237916231, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 4.895111560821533, + "learning_rate": 3e-05, + "loss": 0.0038, + "step": 63 + }, + { + "epoch": 0.128, + "gl_detached_lm_loss": 0.45455634593963623, + "gl_distil_loss": 0.003494451055303216, + "gl_dms_closed_frac": 0.6617327928543091, + "gl_dms_cr": 2.95637845993042, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.8529412746429443, + "gl_dms_target_frac": 0.6494845747947693, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003494451055303216, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 0.656202495098114, + "learning_rate": 3e-05, + "loss": 0.0035, + "step": 64 + }, + { + "epoch": 0.13, + "gl_detached_lm_loss": 0.4529498219490051, + "gl_distil_loss": 0.004102835897356272, + "gl_dms_closed_frac": 0.6730498671531677, + "gl_dms_cr": 3.058912754058838, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.882352828979492, + "gl_dms_target_frac": 0.6530612111091614, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004102835897356272, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 2.2566981315612793, + "learning_rate": 3e-05, + "loss": 0.0041, + "step": 65 + }, + { + "epoch": 0.132, + "gl_detached_lm_loss": 0.4596954882144928, + "gl_distil_loss": 0.0038116772193461657, + "gl_dms_closed_frac": 0.6781314015388489, + "gl_dms_cr": 3.107957363128662, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.91176438331604, + "gl_dms_target_frac": 0.6565656661987305, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0038116772193461657, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.5338054895401, + "learning_rate": 3e-05, + "loss": 0.0038, + "step": 66 + }, + { + "epoch": 0.134, + "gl_detached_lm_loss": 0.4333009123802185, + "gl_distil_loss": 0.0037192029412835836, + "gl_dms_closed_frac": 0.6820775866508484, + "gl_dms_cr": 3.1470866203308105, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.941176414489746, + "gl_dms_target_frac": 0.6600000262260437, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0037192029412835836, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 0.676351010799408, + "learning_rate": 3e-05, + "loss": 0.0037, + "step": 67 + }, + { + "epoch": 0.136, + "gl_detached_lm_loss": 0.47546181082725525, + "gl_distil_loss": 0.004121327772736549, + "gl_dms_closed_frac": 0.6781496405601501, + "gl_dms_cr": 3.1076772212982178, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 2.970588445663452, + "gl_dms_target_frac": 0.6633663177490234, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004121327772736549, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 1.0178581476211548, + "learning_rate": 3e-05, + "loss": 0.0041, + "step": 68 + }, + { + "epoch": 0.138, + "gl_detached_lm_loss": 0.48524773120880127, + "gl_distil_loss": 0.0038435542955994606, + "gl_dms_closed_frac": 0.6703423261642456, + "gl_dms_cr": 3.0347557067871094, + "gl_dms_loss": 0.001511596143245697, + "gl_dms_target_cr": 3.0, + "gl_dms_target_frac": 0.6666666269302368, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005355150438845158, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 6.971191883087158, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 69 + }, + { + "epoch": 0.14, + "gl_detached_lm_loss": 0.47538912296295166, + "gl_distil_loss": 0.003917126450687647, + "gl_dms_closed_frac": 0.6765962839126587, + "gl_dms_cr": 3.0925676822662354, + "gl_dms_loss": 0.0001483336091041565, + "gl_dms_target_cr": 3.029411554336548, + "gl_dms_target_frac": 0.6699028611183167, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004065460059791803, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 2.054696798324585, + "learning_rate": 3e-05, + "loss": 0.0041, + "step": 70 + }, + { + "epoch": 0.142, + "gl_detached_lm_loss": 0.45387187600135803, + "gl_distil_loss": 0.003971648868173361, + "gl_dms_closed_frac": 0.689899206161499, + "gl_dms_cr": 3.2253713607788086, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.058823585510254, + "gl_dms_target_frac": 0.6730769872665405, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003971648868173361, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 0.9588703513145447, + "learning_rate": 3e-05, + "loss": 0.004, + "step": 71 + }, + { + "epoch": 0.144, + "gl_detached_lm_loss": 0.4384338855743408, + "gl_distil_loss": 0.0038274391554296017, + "gl_dms_closed_frac": 0.6943807601928711, + "gl_dms_cr": 3.272785186767578, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.08823561668396, + "gl_dms_target_frac": 0.6761904358863831, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0038274391554296017, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 0.6780281662940979, + "learning_rate": 3e-05, + "loss": 0.0038, + "step": 72 + }, + { + "epoch": 0.146, + "gl_detached_lm_loss": 0.44365930557250977, + "gl_distil_loss": 0.003922718111425638, + "gl_dms_closed_frac": 0.6959836483001709, + "gl_dms_cr": 3.2900500297546387, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.117647171020508, + "gl_dms_target_frac": 0.6792453527450562, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003922718111425638, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 0.5752905011177063, + "learning_rate": 3e-05, + "loss": 0.0039, + "step": 73 + }, + { + "epoch": 0.148, + "gl_detached_lm_loss": 0.4326343834400177, + "gl_distil_loss": 0.003944290801882744, + "gl_dms_closed_frac": 0.696725070476532, + "gl_dms_cr": 3.2982115745544434, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.1470587253570557, + "gl_dms_target_frac": 0.6822429299354553, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.003944290801882744, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 0.799717366695404, + "learning_rate": 3e-05, + "loss": 0.0039, + "step": 74 + }, + { + "epoch": 0.15, + "gl_detached_lm_loss": 0.486849308013916, + "gl_distil_loss": 0.004036651458591223, + "gl_dms_closed_frac": 0.6863027215003967, + "gl_dms_cr": 3.1896045207977295, + "gl_dms_loss": 0.0025970041751861572, + "gl_dms_target_cr": 3.1764702796936035, + "gl_dms_target_frac": 0.6851851940155029, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00663365563377738, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 14.300118446350098, + "learning_rate": 3e-05, + "loss": 0.0066, + "step": 75 + }, + { + "epoch": 0.152, + "gl_detached_lm_loss": 0.4877408444881439, + "gl_distil_loss": 0.004087416920810938, + "gl_dms_closed_frac": 0.688453733921051, + "gl_dms_cr": 3.2113051414489746, + "gl_dms_loss": 0.0024287551641464233, + "gl_dms_target_cr": 3.2058825492858887, + "gl_dms_target_frac": 0.6880733966827393, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0065161725506186485, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 13.963382720947266, + "learning_rate": 3e-05, + "loss": 0.0065, + "step": 76 + }, + { + "epoch": 0.154, + "gl_detached_lm_loss": 0.44424474239349365, + "gl_distil_loss": 0.0036926830653101206, + "gl_dms_closed_frac": 0.6956153512001038, + "gl_dms_cr": 3.2859933376312256, + "gl_dms_loss": 0.0003067776560783386, + "gl_dms_target_cr": 3.2352941036224365, + "gl_dms_target_frac": 0.6909090280532837, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0039994604885578156, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 2.204572916030884, + "learning_rate": 3e-05, + "loss": 0.004, + "step": 77 + }, + { + "epoch": 0.156, + "gl_detached_lm_loss": 0.49248582124710083, + "gl_distil_loss": 0.00488574244081974, + "gl_dms_closed_frac": 0.7086416482925415, + "gl_dms_cr": 3.4329066276550293, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.2647058963775635, + "gl_dms_target_frac": 0.6936936974525452, + "gl_eos_tokens": 11.375, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00488574244081974, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.625, + "gl_positions_without_loss_calculation": 11.375, + "grad_norm": 0.9433761835098267, + "learning_rate": 3e-05, + "loss": 0.0049, + "step": 78 + }, + { + "epoch": 0.158, + "gl_detached_lm_loss": 0.47902607917785645, + "gl_distil_loss": 0.004779959097504616, + "gl_dms_closed_frac": 0.7143765091896057, + "gl_dms_cr": 3.503458023071289, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.2941174507141113, + "gl_dms_target_frac": 0.696428656578064, + "gl_eos_tokens": 10.375, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004779959097504616, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.625, + "gl_positions_without_loss_calculation": 10.375, + "grad_norm": 1.8233813047409058, + "learning_rate": 3e-05, + "loss": 0.0048, + "step": 79 + }, + { + "epoch": 0.16, + "gl_detached_lm_loss": 0.45535245537757874, + "gl_distil_loss": 0.004689468070864677, + "gl_dms_closed_frac": 0.7179027795791626, + "gl_dms_cr": 3.545504093170166, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.3235297203063965, + "gl_dms_target_frac": 0.6991150379180908, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004689468070864677, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 0.7161194086074829, + "learning_rate": 3e-05, + "loss": 0.0047, + "step": 80 + }, + { + "epoch": 0.162, + "gl_detached_lm_loss": 0.4955553412437439, + "gl_distil_loss": 0.005196040961891413, + "gl_dms_closed_frac": 0.7184000015258789, + "gl_dms_cr": 3.553600788116455, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.3529412746429443, + "gl_dms_target_frac": 0.7017544507980347, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005196040961891413, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.219171404838562, + "learning_rate": 3e-05, + "loss": 0.0052, + "step": 81 + }, + { + "epoch": 0.164, + "gl_detached_lm_loss": 0.4791584610939026, + "gl_distil_loss": 0.004651572089642286, + "gl_dms_closed_frac": 0.7162825465202332, + "gl_dms_cr": 3.5258140563964844, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.382352828979492, + "gl_dms_target_frac": 0.7043478488922119, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004651572089642286, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 0.9896210432052612, + "learning_rate": 3e-05, + "loss": 0.0047, + "step": 82 + }, + { + "epoch": 0.166, + "gl_detached_lm_loss": 0.47231408953666687, + "gl_distil_loss": 0.004033881705254316, + "gl_dms_closed_frac": 0.7049646973609924, + "gl_dms_cr": 3.3899085521698, + "gl_dms_loss": 0.0026998519897460938, + "gl_dms_target_cr": 3.41176438331604, + "gl_dms_target_frac": 0.7068965435028076, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00673373369500041, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 13.66180419921875, + "learning_rate": 3e-05, + "loss": 0.0067, + "step": 83 + }, + { + "epoch": 0.168, + "gl_detached_lm_loss": 0.46945416927337646, + "gl_distil_loss": 0.00402010977268219, + "gl_dms_closed_frac": 0.7052032351493835, + "gl_dms_cr": 3.392927408218384, + "gl_dms_loss": 0.004722759127616882, + "gl_dms_target_cr": 3.4411768913269043, + "gl_dms_target_frac": 0.7094016671180725, + "gl_eos_tokens": 12.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008742868900299072, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.375, + "gl_positions_without_loss_calculation": 12.625, + "grad_norm": 16.05010414123535, + "learning_rate": 3e-05, + "loss": 0.0087, + "step": 84 + }, + { + "epoch": 0.17, + "gl_detached_lm_loss": 0.44967958331108093, + "gl_distil_loss": 0.003981797024607658, + "gl_dms_closed_frac": 0.7158433198928833, + "gl_dms_cr": 3.520792245864868, + "gl_dms_loss": 0.0010783001780509949, + "gl_dms_target_cr": 3.470588445663452, + "gl_dms_target_frac": 0.7118644118309021, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005060096736997366, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 6.453094482421875, + "learning_rate": 3e-05, + "loss": 0.0051, + "step": 85 + }, + { + "epoch": 0.172, + "gl_detached_lm_loss": 0.4791140556335449, + "gl_distil_loss": 0.00494480412453413, + "gl_dms_closed_frac": 0.7277750372886658, + "gl_dms_cr": 3.6741161346435547, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.5, + "gl_dms_target_frac": 0.7142857909202576, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00494480412453413, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 0.9170877933502197, + "learning_rate": 3e-05, + "loss": 0.0049, + "step": 86 + }, + { + "epoch": 0.174, + "gl_detached_lm_loss": 0.47055312991142273, + "gl_distil_loss": 0.005052007734775543, + "gl_dms_closed_frac": 0.7355045080184937, + "gl_dms_cr": 3.781733751296997, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.529411554336548, + "gl_dms_target_frac": 0.7166666388511658, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005052007734775543, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 0.9470101594924927, + "learning_rate": 3e-05, + "loss": 0.0051, + "step": 87 + }, + { + "epoch": 0.176, + "gl_detached_lm_loss": 0.4664342999458313, + "gl_distil_loss": 0.004945836495608091, + "gl_dms_closed_frac": 0.7362442016601562, + "gl_dms_cr": 3.79286789894104, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.5588231086730957, + "gl_dms_target_frac": 0.7190083265304565, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004945836495608091, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 0.7975891828536987, + "learning_rate": 3e-05, + "loss": 0.0049, + "step": 88 + }, + { + "epoch": 0.178, + "gl_detached_lm_loss": 0.45679762959480286, + "gl_distil_loss": 0.004826837219297886, + "gl_dms_closed_frac": 0.7391374111175537, + "gl_dms_cr": 3.8347959518432617, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.58823561668396, + "gl_dms_target_frac": 0.7213115096092224, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004826837219297886, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 0.7645382881164551, + "learning_rate": 3e-05, + "loss": 0.0048, + "step": 89 + }, + { + "epoch": 0.18, + "gl_detached_lm_loss": 0.4502984285354614, + "gl_distil_loss": 0.004537405911833048, + "gl_dms_closed_frac": 0.7365341782569885, + "gl_dms_cr": 3.797346830368042, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.617647171020508, + "gl_dms_target_frac": 0.7235772609710693, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004537405911833048, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 2.0026698112487793, + "learning_rate": 3e-05, + "loss": 0.0045, + "step": 90 + }, + { + "epoch": 0.182, + "gl_detached_lm_loss": 0.46442925930023193, + "gl_distil_loss": 0.0043852319940924644, + "gl_dms_closed_frac": 0.727599561214447, + "gl_dms_cr": 3.6718647480010986, + "gl_dms_loss": 0.0010252445936203003, + "gl_dms_target_cr": 3.6470587253570557, + "gl_dms_target_frac": 0.725806474685669, + "gl_eos_tokens": 11.875, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005410476587712765, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.125, + "gl_positions_without_loss_calculation": 11.875, + "grad_norm": 7.0575032234191895, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 91 + }, + { + "epoch": 0.184, + "gl_detached_lm_loss": 0.4796100854873657, + "gl_distil_loss": 0.004694662988185883, + "gl_dms_closed_frac": 0.7282149791717529, + "gl_dms_cr": 3.6801345348358154, + "gl_dms_loss": 0.0016043484210968018, + "gl_dms_target_cr": 3.6764702796936035, + "gl_dms_target_frac": 0.7280000448226929, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006299011409282684, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 6.339512825012207, + "learning_rate": 3e-05, + "loss": 0.0063, + "step": 92 + }, + { + "epoch": 0.186, + "gl_detached_lm_loss": 0.45466843247413635, + "gl_distil_loss": 0.0042836032807827, + "gl_dms_closed_frac": 0.7350358963012695, + "gl_dms_cr": 3.775805950164795, + "gl_dms_loss": 0.0011206194758415222, + "gl_dms_target_cr": 3.7058825492858887, + "gl_dms_target_frac": 0.7301587462425232, + "gl_eos_tokens": 12.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005404223222285509, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.5, + "gl_positions_without_loss_calculation": 12.5, + "grad_norm": 4.007299900054932, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 93 + }, + { + "epoch": 0.188, + "gl_detached_lm_loss": 0.4374981224536896, + "gl_distil_loss": 0.005377020221203566, + "gl_dms_closed_frac": 0.7533713579177856, + "gl_dms_cr": 4.057657241821289, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.7352941036224365, + "gl_dms_target_frac": 0.7322835326194763, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005377020221203566, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 2.0438311100006104, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 94 + }, + { + "epoch": 0.19, + "gl_detached_lm_loss": 0.46797192096710205, + "gl_distil_loss": 0.005093785934150219, + "gl_dms_closed_frac": 0.7516717314720154, + "gl_dms_cr": 4.029850482940674, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.7647058963775635, + "gl_dms_target_frac": 0.734375, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005093785934150219, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.0185790061950684, + "learning_rate": 3e-05, + "loss": 0.0051, + "step": 95 + }, + { + "epoch": 0.192, + "gl_detached_lm_loss": 0.47336265444755554, + "gl_distil_loss": 0.005353494081646204, + "gl_dms_closed_frac": 0.7560113072395325, + "gl_dms_cr": 4.101371765136719, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.7941174507141113, + "gl_dms_target_frac": 0.7364340424537659, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005353494081646204, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 1.1575500965118408, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 96 + }, + { + "epoch": 0.194, + "gl_detached_lm_loss": 0.4636452794075012, + "gl_distil_loss": 0.005527734290808439, + "gl_dms_closed_frac": 0.7628649473190308, + "gl_dms_cr": 4.217801094055176, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.8235297203063965, + "gl_dms_target_frac": 0.7384615540504456, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005527734290808439, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 1.2612135410308838, + "learning_rate": 3e-05, + "loss": 0.0055, + "step": 97 + }, + { + "epoch": 0.196, + "gl_detached_lm_loss": 0.49130597710609436, + "gl_distil_loss": 0.005308451130986214, + "gl_dms_closed_frac": 0.755087673664093, + "gl_dms_cr": 4.086455345153809, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.8529412746429443, + "gl_dms_target_frac": 0.7404580116271973, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005308451130986214, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 0.8162508606910706, + "learning_rate": 3e-05, + "loss": 0.0053, + "step": 98 + }, + { + "epoch": 0.198, + "gl_detached_lm_loss": 0.4728451669216156, + "gl_distil_loss": 0.004922911990433931, + "gl_dms_closed_frac": 0.7501329779624939, + "gl_dms_cr": 4.0030927658081055, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 3.882352828979492, + "gl_dms_target_frac": 0.7424242496490479, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.004922911990433931, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 0.8720424771308899, + "learning_rate": 3e-05, + "loss": 0.0049, + "step": 99 + }, + { + "epoch": 0.2, + "gl_detached_lm_loss": 0.46046435832977295, + "gl_distil_loss": 0.004833785351365805, + "gl_dms_closed_frac": 0.7439101934432983, + "gl_dms_cr": 3.9068267345428467, + "gl_dms_loss": 0.0024033188819885254, + "gl_dms_target_cr": 3.91176438331604, + "gl_dms_target_frac": 0.7443609237670898, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0072371033020317554, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 8.47694206237793, + "learning_rate": 3e-05, + "loss": 0.0072, + "step": 100 + }, + { + "epoch": 0.202, + "gl_detached_lm_loss": 0.49140673875808716, + "gl_distil_loss": 0.004810246638953686, + "gl_dms_closed_frac": 0.7399882078170776, + "gl_dms_cr": 3.8466882705688477, + "gl_dms_loss": 0.006481736898422241, + "gl_dms_target_cr": 3.9411768913269043, + "gl_dms_target_frac": 0.7462686896324158, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.011291983537375927, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 14.86678409576416, + "learning_rate": 3e-05, + "loss": 0.0113, + "step": 101 + }, + { + "epoch": 0.204, + "gl_detached_lm_loss": 0.4723597764968872, + "gl_distil_loss": 0.004680398851633072, + "gl_dms_closed_frac": 0.7438403964042664, + "gl_dms_cr": 3.9060354232788086, + "gl_dms_loss": 0.005097024142742157, + "gl_dms_target_cr": 3.970588445663452, + "gl_dms_target_frac": 0.7481480836868286, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009777422994375229, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 12.764368057250977, + "learning_rate": 3e-05, + "loss": 0.0098, + "step": 102 + }, + { + "epoch": 0.206, + "gl_detached_lm_loss": 0.45221349596977234, + "gl_distil_loss": 0.00520294951274991, + "gl_dms_closed_frac": 0.7546575665473938, + "gl_dms_cr": 4.077920436859131, + "gl_dms_loss": 0.0010900869965553284, + "gl_dms_target_cr": 4.0, + "gl_dms_target_frac": 0.75, + "gl_eos_tokens": 12.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006293036509305239, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.5, + "gl_positions_without_loss_calculation": 12.5, + "grad_norm": 4.082666873931885, + "learning_rate": 3e-05, + "loss": 0.0063, + "step": 103 + }, + { + "epoch": 0.208, + "gl_detached_lm_loss": 0.4825055003166199, + "gl_distil_loss": 0.0058518978767097, + "gl_dms_closed_frac": 0.7645120620727539, + "gl_dms_cr": 4.248115539550781, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.029411792755127, + "gl_dms_target_frac": 0.7518247961997986, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0058518978767097, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.0137183666229248, + "learning_rate": 3e-05, + "loss": 0.0059, + "step": 104 + }, + { + "epoch": 0.21, + "gl_detached_lm_loss": 0.47762972116470337, + "gl_distil_loss": 0.005726248025894165, + "gl_dms_closed_frac": 0.770868182182312, + "gl_dms_cr": 4.367452621459961, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.058823108673096, + "gl_dms_target_frac": 0.7536231279373169, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005726248025894165, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 1.5721111297607422, + "learning_rate": 3e-05, + "loss": 0.0057, + "step": 105 + }, + { + "epoch": 0.212, + "gl_detached_lm_loss": 0.4453291594982147, + "gl_distil_loss": 0.005907304584980011, + "gl_dms_closed_frac": 0.7767513990402222, + "gl_dms_cr": 4.480729103088379, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.088235378265381, + "gl_dms_target_frac": 0.755395770072937, + "gl_eos_tokens": 13.375, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005907304584980011, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.625, + "gl_positions_without_loss_calculation": 13.375, + "grad_norm": 1.3633668422698975, + "learning_rate": 3e-05, + "loss": 0.0059, + "step": 106 + }, + { + "epoch": 0.214, + "gl_detached_lm_loss": 0.4815954267978668, + "gl_distil_loss": 0.005865244194865227, + "gl_dms_closed_frac": 0.7773128747940063, + "gl_dms_cr": 4.4921698570251465, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.117647171020508, + "gl_dms_target_frac": 0.7571429014205933, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005865244194865227, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.251830816268921, + "learning_rate": 3e-05, + "loss": 0.0059, + "step": 107 + }, + { + "epoch": 0.216, + "gl_detached_lm_loss": 0.473334401845932, + "gl_distil_loss": 0.005540169775485992, + "gl_dms_closed_frac": 0.7712863683700562, + "gl_dms_cr": 4.375051498413086, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.147058963775635, + "gl_dms_target_frac": 0.7588652968406677, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005540169775485992, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 0.9412449598312378, + "learning_rate": 3e-05, + "loss": 0.0055, + "step": 108 + }, + { + "epoch": 0.218, + "gl_detached_lm_loss": 0.4723076820373535, + "gl_distil_loss": 0.005424611270427704, + "gl_dms_closed_frac": 0.7736709713935852, + "gl_dms_cr": 4.420588493347168, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.17647123336792, + "gl_dms_target_frac": 0.7605633735656738, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.005424611270427704, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 1.0735340118408203, + "learning_rate": 3e-05, + "loss": 0.0054, + "step": 109 + }, + { + "epoch": 0.22, + "gl_detached_lm_loss": 0.48178762197494507, + "gl_distil_loss": 0.005986446980386972, + "gl_dms_closed_frac": 0.764805793762207, + "gl_dms_cr": 4.252771854400635, + "gl_dms_loss": 0.000624343752861023, + "gl_dms_target_cr": 4.205882549285889, + "gl_dms_target_frac": 0.7622377872467041, + "gl_eos_tokens": 9.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006610790733247995, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32759.0, + "gl_positions_without_loss_calculation": 9.0, + "grad_norm": 3.545208215713501, + "learning_rate": 3e-05, + "loss": 0.0066, + "step": 110 + }, + { + "epoch": 0.222, + "gl_detached_lm_loss": 0.4608258605003357, + "gl_distil_loss": 0.00500837666913867, + "gl_dms_closed_frac": 0.763525128364563, + "gl_dms_cr": 4.232396125793457, + "gl_dms_loss": 0.003266759216785431, + "gl_dms_target_cr": 4.235294342041016, + "gl_dms_target_frac": 0.7638888955116272, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008275135420262814, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 7.840135097503662, + "learning_rate": 3e-05, + "loss": 0.0083, + "step": 111 + }, + { + "epoch": 0.224, + "gl_detached_lm_loss": 0.46065497398376465, + "gl_distil_loss": 0.005225225817412138, + "gl_dms_closed_frac": 0.7623160481452942, + "gl_dms_cr": 4.2096405029296875, + "gl_dms_loss": 0.004034481942653656, + "gl_dms_target_cr": 4.264705657958984, + "gl_dms_target_frac": 0.7655172348022461, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009259707294404507, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 11.423973083496094, + "learning_rate": 3e-05, + "loss": 0.0093, + "step": 112 + }, + { + "epoch": 0.226, + "gl_detached_lm_loss": 0.47371160984039307, + "gl_distil_loss": 0.005666877143085003, + "gl_dms_closed_frac": 0.774148166179657, + "gl_dms_cr": 4.4307637214660645, + "gl_dms_loss": 0.0007906854152679443, + "gl_dms_target_cr": 4.294117450714111, + "gl_dms_target_frac": 0.767123281955719, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006457562558352947, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 1.7580599784851074, + "learning_rate": 3e-05, + "loss": 0.0065, + "step": 113 + }, + { + "epoch": 0.228, + "gl_detached_lm_loss": 0.4567711353302002, + "gl_distil_loss": 0.00592584116384387, + "gl_dms_closed_frac": 0.7843614220619202, + "gl_dms_cr": 4.637998580932617, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.32352876663208, + "gl_dms_target_frac": 0.7687073945999146, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00592584116384387, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 0.8876529932022095, + "learning_rate": 3e-05, + "loss": 0.0059, + "step": 114 + }, + { + "epoch": 0.23, + "gl_detached_lm_loss": 0.4576658606529236, + "gl_distil_loss": 0.0065962583757936954, + "gl_dms_closed_frac": 0.7914761900901794, + "gl_dms_cr": 4.798535346984863, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.352941036224365, + "gl_dms_target_frac": 0.7702702879905701, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0065962583757936954, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.1149492263793945, + "learning_rate": 3e-05, + "loss": 0.0066, + "step": 115 + }, + { + "epoch": 0.232, + "gl_detached_lm_loss": 0.4753264784812927, + "gl_distil_loss": 0.006386497989296913, + "gl_dms_closed_frac": 0.7959333658218384, + "gl_dms_cr": 4.9022216796875, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.382352828979492, + "gl_dms_target_frac": 0.7718120217323303, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006386497989296913, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.1145360469818115, + "learning_rate": 3e-05, + "loss": 0.0064, + "step": 116 + }, + { + "epoch": 0.234, + "gl_detached_lm_loss": 0.4654878079891205, + "gl_distil_loss": 0.006225732620805502, + "gl_dms_closed_frac": 0.7925079464912415, + "gl_dms_cr": 4.8202338218688965, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.411764621734619, + "gl_dms_target_frac": 0.7733333110809326, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006225732620805502, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 1.121919870376587, + "learning_rate": 3e-05, + "loss": 0.0062, + "step": 117 + }, + { + "epoch": 0.236, + "gl_detached_lm_loss": 0.46314623951911926, + "gl_distil_loss": 0.0068687875755131245, + "gl_dms_closed_frac": 0.7923510074615479, + "gl_dms_cr": 4.819302558898926, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.441176891326904, + "gl_dms_target_frac": 0.7748345136642456, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0068687875755131245, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 2.76143741607666, + "learning_rate": 3e-05, + "loss": 0.0069, + "step": 118 + }, + { + "epoch": 0.238, + "gl_detached_lm_loss": 0.4607497453689575, + "gl_distil_loss": 0.006361807230859995, + "gl_dms_closed_frac": 0.786472499370575, + "gl_dms_cr": 4.685853004455566, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.470588207244873, + "gl_dms_target_frac": 0.7763157486915588, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006361807230859995, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 0.921043872833252, + "learning_rate": 3e-05, + "loss": 0.0064, + "step": 119 + }, + { + "epoch": 0.24, + "gl_detached_lm_loss": 0.47533145546913147, + "gl_distil_loss": 0.005683056078851223, + "gl_dms_closed_frac": 0.7742487788200378, + "gl_dms_cr": 4.4305853843688965, + "gl_dms_loss": 0.0037161409854888916, + "gl_dms_target_cr": 4.5, + "gl_dms_target_frac": 0.7777777314186096, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009399197064340115, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 13.05197811126709, + "learning_rate": 3e-05, + "loss": 0.0094, + "step": 120 + }, + { + "epoch": 0.242, + "gl_detached_lm_loss": 0.4673323631286621, + "gl_distil_loss": 0.005364595912396908, + "gl_dms_closed_frac": 0.774548351764679, + "gl_dms_cr": 4.43836784362793, + "gl_dms_loss": 0.005432605743408203, + "gl_dms_target_cr": 4.529411792755127, + "gl_dms_target_frac": 0.779220700263977, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.010797200724482536, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 11.362590789794922, + "learning_rate": 3e-05, + "loss": 0.0108, + "step": 121 + }, + { + "epoch": 0.244, + "gl_detached_lm_loss": 0.46205538511276245, + "gl_distil_loss": 0.005460300482809544, + "gl_dms_closed_frac": 0.7765141725540161, + "gl_dms_cr": 4.475861072540283, + "gl_dms_loss": 0.00442194938659668, + "gl_dms_target_cr": 4.558823108673096, + "gl_dms_target_frac": 0.7806451320648193, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009882249869406223, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 11.339438438415527, + "learning_rate": 3e-05, + "loss": 0.0099, + "step": 122 + }, + { + "epoch": 0.246, + "gl_detached_lm_loss": 0.4668773412704468, + "gl_distil_loss": 0.006131088826805353, + "gl_dms_closed_frac": 0.7835890650749207, + "gl_dms_cr": 4.622951030731201, + "gl_dms_loss": 0.0013428032398223877, + "gl_dms_target_cr": 4.588235378265381, + "gl_dms_target_frac": 0.7820512056350708, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007473892066627741, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 4.741786479949951, + "learning_rate": 3e-05, + "loss": 0.0075, + "step": 123 + }, + { + "epoch": 0.248, + "gl_detached_lm_loss": 0.48726099729537964, + "gl_distil_loss": 0.006155762821435928, + "gl_dms_closed_frac": 0.7912291884422302, + "gl_dms_cr": 4.793363094329834, + "gl_dms_loss": 0.00044589489698410034, + "gl_dms_target_cr": 4.617647171020508, + "gl_dms_target_frac": 0.783439576625824, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006601657718420029, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 1.6124783754348755, + "learning_rate": 3e-05, + "loss": 0.0066, + "step": 124 + }, + { + "epoch": 0.25, + "gl_detached_lm_loss": 0.4655800461769104, + "gl_distil_loss": 0.006976068951189518, + "gl_dms_closed_frac": 0.8033832907676697, + "gl_dms_cr": 5.088557243347168, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.647058963775635, + "gl_dms_target_frac": 0.7848101258277893, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006976068951189518, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 1.4380581378936768, + "learning_rate": 3e-05, + "loss": 0.007, + "step": 125 + }, + { + "epoch": 0.252, + "gl_detached_lm_loss": 0.4725133776664734, + "gl_distil_loss": 0.007554593961685896, + "gl_dms_closed_frac": 0.8122463822364807, + "gl_dms_cr": 5.328559398651123, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.67647123336792, + "gl_dms_target_frac": 0.7861634492874146, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007554593961685896, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.0721145868301392, + "learning_rate": 3e-05, + "loss": 0.0076, + "step": 126 + }, + { + "epoch": 0.254, + "gl_detached_lm_loss": 0.470710426568985, + "gl_distil_loss": 0.007993207313120365, + "gl_dms_closed_frac": 0.8147773742675781, + "gl_dms_cr": 5.401104927062988, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.705882549285889, + "gl_dms_target_frac": 0.7874999642372131, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007993207313120365, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.3880622386932373, + "learning_rate": 3e-05, + "loss": 0.008, + "step": 127 + }, + { + "epoch": 0.256, + "gl_detached_lm_loss": 0.4928918182849884, + "gl_distil_loss": 0.007520680315792561, + "gl_dms_closed_frac": 0.8154762983322144, + "gl_dms_cr": 5.420012950897217, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.735294342041016, + "gl_dms_target_frac": 0.7888198494911194, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007520680315792561, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 1.0682696104049683, + "learning_rate": 3e-05, + "loss": 0.0075, + "step": 128 + }, + { + "epoch": 0.258, + "gl_detached_lm_loss": 0.4680056571960449, + "gl_distil_loss": 0.00793403573334217, + "gl_dms_closed_frac": 0.8118024468421936, + "gl_dms_cr": 5.31453800201416, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.764705657958984, + "gl_dms_target_frac": 0.790123462677002, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00793403573334217, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 1.0643372535705566, + "learning_rate": 3e-05, + "loss": 0.0079, + "step": 129 + }, + { + "epoch": 0.26, + "gl_detached_lm_loss": 0.47951599955558777, + "gl_distil_loss": 0.007271426264196634, + "gl_dms_closed_frac": 0.8056186437606812, + "gl_dms_cr": 5.148845672607422, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.794117450714111, + "gl_dms_target_frac": 0.7914109826087952, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007271426264196634, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 2.485182046890259, + "learning_rate": 3e-05, + "loss": 0.0073, + "step": 130 + }, + { + "epoch": 0.262, + "gl_detached_lm_loss": 0.4433704912662506, + "gl_distil_loss": 0.006428849883377552, + "gl_dms_closed_frac": 0.8004820942878723, + "gl_dms_cr": 5.012540817260742, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.82352876663208, + "gl_dms_target_frac": 0.7926830053329468, + "gl_eos_tokens": 9.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006428849883377552, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.5, + "gl_positions_without_loss_calculation": 9.5, + "grad_norm": 1.4084274768829346, + "learning_rate": 3e-05, + "loss": 0.0064, + "step": 131 + }, + { + "epoch": 0.264, + "gl_detached_lm_loss": 0.4866332709789276, + "gl_distil_loss": 0.006288310047239065, + "gl_dms_closed_frac": 0.78985196352005, + "gl_dms_cr": 4.759027481079102, + "gl_dms_loss": 0.004087455570697784, + "gl_dms_target_cr": 4.852941036224365, + "gl_dms_target_frac": 0.793939471244812, + "gl_eos_tokens": 11.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.010375766083598137, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.75, + "gl_positions_without_loss_calculation": 11.25, + "grad_norm": 13.989530563354492, + "learning_rate": 3e-05, + "loss": 0.0104, + "step": 132 + }, + { + "epoch": 0.266, + "gl_detached_lm_loss": 0.4728991985321045, + "gl_distil_loss": 0.00651888782158494, + "gl_dms_closed_frac": 0.7894306182861328, + "gl_dms_cr": 4.750518321990967, + "gl_dms_loss": 0.005873598158359528, + "gl_dms_target_cr": 4.882352828979492, + "gl_dms_target_frac": 0.7951807379722595, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.01239248737692833, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 12.475295066833496, + "learning_rate": 3e-05, + "loss": 0.0124, + "step": 133 + }, + { + "epoch": 0.268, + "gl_detached_lm_loss": 0.43680471181869507, + "gl_distil_loss": 0.005585328210145235, + "gl_dms_closed_frac": 0.7920165657997131, + "gl_dms_cr": 4.809713363647461, + "gl_dms_loss": 0.004696056246757507, + "gl_dms_target_cr": 4.911764621734619, + "gl_dms_target_frac": 0.796407163143158, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.010281385853886604, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 11.904838562011719, + "learning_rate": 3e-05, + "loss": 0.0103, + "step": 134 + }, + { + "epoch": 0.27, + "gl_detached_lm_loss": 0.49193722009658813, + "gl_distil_loss": 0.006255386397242546, + "gl_dms_closed_frac": 0.794114351272583, + "gl_dms_cr": 4.860118865966797, + "gl_dms_loss": 0.004544064402580261, + "gl_dms_target_cr": 4.941176891326904, + "gl_dms_target_frac": 0.7976189851760864, + "gl_eos_tokens": 10.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.010799449868500233, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.75, + "gl_positions_without_loss_calculation": 10.25, + "grad_norm": 10.285233497619629, + "learning_rate": 3e-05, + "loss": 0.0108, + "step": 135 + }, + { + "epoch": 0.272, + "gl_detached_lm_loss": 0.454690158367157, + "gl_distil_loss": 0.006719416473060846, + "gl_dms_closed_frac": 0.8077540397644043, + "gl_dms_cr": 5.204880237579346, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 4.970588207244873, + "gl_dms_target_frac": 0.7988166213035583, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006719416473060846, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 8.923818588256836, + "learning_rate": 3e-05, + "loss": 0.0067, + "step": 136 + }, + { + "epoch": 0.274, + "gl_detached_lm_loss": 0.45901381969451904, + "gl_distil_loss": 0.007275748066604137, + "gl_dms_closed_frac": 0.8141677975654602, + "gl_dms_cr": 5.384031295776367, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.0, + "gl_dms_target_frac": 0.8000000715255737, + "gl_eos_tokens": 11.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007275748066604137, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.0, + "gl_positions_without_loss_calculation": 11.0, + "grad_norm": 3.378786325454712, + "learning_rate": 3e-05, + "loss": 0.0073, + "step": 137 + }, + { + "epoch": 0.276, + "gl_detached_lm_loss": 0.4887048006057739, + "gl_distil_loss": 0.007861443795263767, + "gl_dms_closed_frac": 0.8187439441680908, + "gl_dms_cr": 5.519390106201172, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.029411792755127, + "gl_dms_target_frac": 0.8011695146560669, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007861443795263767, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 1.7097647190093994, + "learning_rate": 3e-05, + "loss": 0.0079, + "step": 138 + }, + { + "epoch": 0.278, + "gl_detached_lm_loss": 0.4592478573322296, + "gl_distil_loss": 0.0075279176235198975, + "gl_dms_closed_frac": 0.8244554400444031, + "gl_dms_cr": 5.699601650238037, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.058823108673096, + "gl_dms_target_frac": 0.8023256659507751, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0075279176235198975, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.3076081275939941, + "learning_rate": 3e-05, + "loss": 0.0075, + "step": 139 + }, + { + "epoch": 0.28, + "gl_detached_lm_loss": 0.46759268641471863, + "gl_distil_loss": 0.008083690889179707, + "gl_dms_closed_frac": 0.8244903087615967, + "gl_dms_cr": 5.70239782333374, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.088235378265381, + "gl_dms_target_frac": 0.8034682273864746, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008083690889179707, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 2.303365468978882, + "learning_rate": 3e-05, + "loss": 0.0081, + "step": 140 + }, + { + "epoch": 0.282, + "gl_detached_lm_loss": 0.4478227496147156, + "gl_distil_loss": 0.0072470554150640965, + "gl_dms_closed_frac": 0.8226778507232666, + "gl_dms_cr": 5.6427435874938965, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.117647171020508, + "gl_dms_target_frac": 0.8045977354049683, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0072470554150640965, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 2.6740522384643555, + "learning_rate": 3e-05, + "loss": 0.0072, + "step": 141 + }, + { + "epoch": 0.284, + "gl_detached_lm_loss": 0.4702463150024414, + "gl_distil_loss": 0.0071958452463150024, + "gl_dms_closed_frac": 0.8195258975028992, + "gl_dms_cr": 5.542600631713867, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.147058963775635, + "gl_dms_target_frac": 0.8057142496109009, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0071958452463150024, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 3.9141881465911865, + "learning_rate": 3e-05, + "loss": 0.0072, + "step": 142 + }, + { + "epoch": 0.286, + "gl_detached_lm_loss": 0.4721701443195343, + "gl_distil_loss": 0.006920252926647663, + "gl_dms_closed_frac": 0.8142572045326233, + "gl_dms_cr": 5.386926651000977, + "gl_dms_loss": 5.720555782318115e-05, + "gl_dms_target_cr": 5.17647123336792, + "gl_dms_target_frac": 0.8068181276321411, + "gl_eos_tokens": 13.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006977458484470844, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.75, + "gl_positions_without_loss_calculation": 13.25, + "grad_norm": 1.4146333932876587, + "learning_rate": 3e-05, + "loss": 0.007, + "step": 143 + }, + { + "epoch": 0.288, + "gl_detached_lm_loss": 0.4847974479198456, + "gl_distil_loss": 0.0072076404467225075, + "gl_dms_closed_frac": 0.813251793384552, + "gl_dms_cr": 5.357405185699463, + "gl_dms_loss": 7.994472980499268e-05, + "gl_dms_target_cr": 5.205882549285889, + "gl_dms_target_frac": 0.8079095482826233, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0072875851765275, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 1.362390160560608, + "learning_rate": 3e-05, + "loss": 0.0073, + "step": 144 + }, + { + "epoch": 0.29, + "gl_detached_lm_loss": 0.46022504568099976, + "gl_distil_loss": 0.00686027854681015, + "gl_dms_closed_frac": 0.8179160356521606, + "gl_dms_cr": 5.494892120361328, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.235294342041016, + "gl_dms_target_frac": 0.8089886903762817, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00686027854681015, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 1.7687309980392456, + "learning_rate": 3e-05, + "loss": 0.0069, + "step": 145 + }, + { + "epoch": 0.292, + "gl_detached_lm_loss": 0.46016189455986023, + "gl_distil_loss": 0.0068475911393761635, + "gl_dms_closed_frac": 0.8167369365692139, + "gl_dms_cr": 5.463459014892578, + "gl_dms_loss": 0.00013460218906402588, + "gl_dms_target_cr": 5.264705657958984, + "gl_dms_target_frac": 0.8100557923316956, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006982193328440189, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 2.82254695892334, + "learning_rate": 3e-05, + "loss": 0.007, + "step": 146 + }, + { + "epoch": 0.294, + "gl_detached_lm_loss": 0.46145033836364746, + "gl_distil_loss": 0.00709201954305172, + "gl_dms_closed_frac": 0.8229960799217224, + "gl_dms_cr": 5.653026103973389, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.294117450714111, + "gl_dms_target_frac": 0.8111110329627991, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00709201954305172, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.2006562948226929, + "learning_rate": 3e-05, + "loss": 0.0071, + "step": 147 + }, + { + "epoch": 0.296, + "gl_detached_lm_loss": 0.4399712383747101, + "gl_distil_loss": 0.007063089869916439, + "gl_dms_closed_frac": 0.8253207802772522, + "gl_dms_cr": 5.728308200836182, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.32352876663208, + "gl_dms_target_frac": 0.8121547102928162, + "gl_eos_tokens": 10.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007063089869916439, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.0, + "gl_positions_without_loss_calculation": 10.0, + "grad_norm": 2.0583231449127197, + "learning_rate": 3e-05, + "loss": 0.0071, + "step": 148 + }, + { + "epoch": 0.298, + "gl_detached_lm_loss": 0.4723561108112335, + "gl_distil_loss": 0.007901018485426903, + "gl_dms_closed_frac": 0.8258881568908691, + "gl_dms_cr": 5.744671821594238, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.352940559387207, + "gl_dms_target_frac": 0.813186764717102, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007901018485426903, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.7985323667526245, + "learning_rate": 3e-05, + "loss": 0.0079, + "step": 149 + }, + { + "epoch": 0.3, + "gl_detached_lm_loss": 0.4717611074447632, + "gl_distil_loss": 0.008037258870899677, + "gl_dms_closed_frac": 0.8242531418800354, + "gl_dms_cr": 5.693231105804443, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.382352828979492, + "gl_dms_target_frac": 0.8142076134681702, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008037258870899677, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 1.1288707256317139, + "learning_rate": 3e-05, + "loss": 0.008, + "step": 150 + }, + { + "epoch": 0.302, + "gl_detached_lm_loss": 0.46595779061317444, + "gl_distil_loss": 0.006495564244687557, + "gl_dms_closed_frac": 0.8167760968208313, + "gl_dms_cr": 5.466947555541992, + "gl_dms_loss": 0.0025366097688674927, + "gl_dms_target_cr": 5.411765098571777, + "gl_dms_target_frac": 0.8152174353599548, + "gl_eos_tokens": 13.875, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00903217401355505, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.125, + "gl_positions_without_loss_calculation": 13.875, + "grad_norm": 4.6393632888793945, + "learning_rate": 3e-05, + "loss": 0.009, + "step": 151 + }, + { + "epoch": 0.304, + "gl_detached_lm_loss": 0.4414372742176056, + "gl_distil_loss": 0.006566391792148352, + "gl_dms_closed_frac": 0.8200860023498535, + "gl_dms_cr": 5.560153484344482, + "gl_dms_loss": 0.000374525785446167, + "gl_dms_target_cr": 5.441176891326904, + "gl_dms_target_frac": 0.8162163496017456, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.006940917111933231, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 3.3325436115264893, + "learning_rate": 3e-05, + "loss": 0.0069, + "step": 152 + }, + { + "epoch": 0.306, + "gl_detached_lm_loss": 0.4721607267856598, + "gl_distil_loss": 0.007050786633044481, + "gl_dms_closed_frac": 0.8164743185043335, + "gl_dms_cr": 5.452330112457275, + "gl_dms_loss": 0.0020945072174072266, + "gl_dms_target_cr": 5.470588207244873, + "gl_dms_target_frac": 0.8172043561935425, + "gl_eos_tokens": 10.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009145294316112995, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.375, + "gl_positions_without_loss_calculation": 10.625, + "grad_norm": 4.4689860343933105, + "learning_rate": 3e-05, + "loss": 0.0091, + "step": 153 + }, + { + "epoch": 0.308, + "gl_detached_lm_loss": 0.49011290073394775, + "gl_distil_loss": 0.007768554147332907, + "gl_dms_closed_frac": 0.8201054930686951, + "gl_dms_cr": 5.564466953277588, + "gl_dms_loss": 0.0018368959426879883, + "gl_dms_target_cr": 5.5, + "gl_dms_target_frac": 0.8181818723678589, + "gl_eos_tokens": 9.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009605450555682182, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.5, + "gl_positions_without_loss_calculation": 9.5, + "grad_norm": 4.584949493408203, + "learning_rate": 3e-05, + "loss": 0.0096, + "step": 154 + }, + { + "epoch": 0.31, + "gl_detached_lm_loss": 0.4801517724990845, + "gl_distil_loss": 0.0074509927071630955, + "gl_dms_closed_frac": 0.829155683517456, + "gl_dms_cr": 5.857370376586914, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.529411792755127, + "gl_dms_target_frac": 0.8191489577293396, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0074509927071630955, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 0.992720365524292, + "learning_rate": 3e-05, + "loss": 0.0075, + "step": 155 + }, + { + "epoch": 0.312, + "gl_detached_lm_loss": 0.47425395250320435, + "gl_distil_loss": 0.008633583784103394, + "gl_dms_closed_frac": 0.8352111577987671, + "gl_dms_cr": 6.070135593414307, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.558823108673096, + "gl_dms_target_frac": 0.8201056718826294, + "gl_eos_tokens": 10.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008633583784103394, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.5, + "gl_positions_without_loss_calculation": 10.5, + "grad_norm": 1.5924896001815796, + "learning_rate": 3e-05, + "loss": 0.0086, + "step": 156 + }, + { + "epoch": 0.314, + "gl_detached_lm_loss": 0.4853774905204773, + "gl_distil_loss": 0.008461525663733482, + "gl_dms_closed_frac": 0.8332453370094299, + "gl_dms_cr": 6.001097679138184, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.588234901428223, + "gl_dms_target_frac": 0.821052610874176, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008461525663733482, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.1708149909973145, + "learning_rate": 3e-05, + "loss": 0.0085, + "step": 157 + }, + { + "epoch": 0.316, + "gl_detached_lm_loss": 0.5149613618850708, + "gl_distil_loss": 0.008489280007779598, + "gl_dms_closed_frac": 0.8285330533981323, + "gl_dms_cr": 5.833188056945801, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.617647171020508, + "gl_dms_target_frac": 0.8219895362854004, + "gl_eos_tokens": 11.5, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008489280007779598, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.5, + "gl_positions_without_loss_calculation": 11.5, + "grad_norm": 1.376883864402771, + "learning_rate": 3e-05, + "loss": 0.0085, + "step": 158 + }, + { + "epoch": 0.318, + "gl_detached_lm_loss": 0.4815997779369354, + "gl_distil_loss": 0.007431944832205772, + "gl_dms_closed_frac": 0.8258528709411621, + "gl_dms_cr": 5.750091552734375, + "gl_dms_loss": 0.0016558989882469177, + "gl_dms_target_cr": 5.647059440612793, + "gl_dms_target_frac": 0.8229166269302368, + "gl_eos_tokens": 10.875, + "gl_input_tokens": 32768.0, + "gl_loss": 0.00908784382045269, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.125, + "gl_positions_without_loss_calculation": 10.875, + "grad_norm": 2.853998899459839, + "learning_rate": 3e-05, + "loss": 0.0091, + "step": 159 + }, + { + "epoch": 0.32, + "gl_detached_lm_loss": 0.4931546449661255, + "gl_distil_loss": 0.0077752345241606236, + "gl_dms_closed_frac": 0.8248758912086487, + "gl_dms_cr": 5.713653087615967, + "gl_dms_loss": 0.001398041844367981, + "gl_dms_target_cr": 5.67647123336792, + "gl_dms_target_frac": 0.8238343000411987, + "gl_eos_tokens": 9.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.009173276834189892, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32758.25, + "gl_positions_without_loss_calculation": 9.75, + "grad_norm": 5.879312515258789, + "learning_rate": 3e-05, + "loss": 0.0092, + "step": 160 + }, + { + "epoch": 0.322, + "gl_detached_lm_loss": 0.49633410573005676, + "gl_distil_loss": 0.007922427728772163, + "gl_dms_closed_frac": 0.8279393911361694, + "gl_dms_cr": 5.816441535949707, + "gl_dms_loss": 0.0007596909999847412, + "gl_dms_target_cr": 5.705882549285889, + "gl_dms_target_frac": 0.8247422575950623, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008682118728756905, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 4.280725002288818, + "learning_rate": 3e-05, + "loss": 0.0087, + "step": 161 + }, + { + "epoch": 0.324, + "gl_detached_lm_loss": 0.46856874227523804, + "gl_distil_loss": 0.0076766908168792725, + "gl_dms_closed_frac": 0.8342061042785645, + "gl_dms_cr": 6.03446102142334, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.735294342041016, + "gl_dms_target_frac": 0.8256410956382751, + "gl_eos_tokens": 12.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0076766908168792725, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.25, + "gl_positions_without_loss_calculation": 12.75, + "grad_norm": 1.2406954765319824, + "learning_rate": 3e-05, + "loss": 0.0077, + "step": 162 + }, + { + "epoch": 0.326, + "gl_detached_lm_loss": 0.4626222252845764, + "gl_distil_loss": 0.007656690198928118, + "gl_dms_closed_frac": 0.8375633358955383, + "gl_dms_cr": 6.158156394958496, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.764705657958984, + "gl_dms_target_frac": 0.8265305757522583, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007656690198928118, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 1.0347411632537842, + "learning_rate": 3e-05, + "loss": 0.0077, + "step": 163 + }, + { + "epoch": 0.328, + "gl_detached_lm_loss": 0.465984582901001, + "gl_distil_loss": 0.008057629689574242, + "gl_dms_closed_frac": 0.8382847905158997, + "gl_dms_cr": 6.185460090637207, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.794117450714111, + "gl_dms_target_frac": 0.8274111747741699, + "gl_eos_tokens": 13.125, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008057629689574242, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32754.875, + "gl_positions_without_loss_calculation": 13.125, + "grad_norm": 1.1180733442306519, + "learning_rate": 3e-05, + "loss": 0.0081, + "step": 164 + }, + { + "epoch": 0.33, + "gl_detached_lm_loss": 0.4694860577583313, + "gl_distil_loss": 0.007432657293975353, + "gl_dms_closed_frac": 0.8331831097602844, + "gl_dms_cr": 5.996460914611816, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.82352876663208, + "gl_dms_target_frac": 0.8282828330993652, + "gl_eos_tokens": 11.625, + "gl_input_tokens": 32768.0, + "gl_loss": 0.007432657293975353, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.375, + "gl_positions_without_loss_calculation": 11.625, + "grad_norm": 1.125787615776062, + "learning_rate": 3e-05, + "loss": 0.0074, + "step": 165 + }, + { + "epoch": 0.332, + "gl_detached_lm_loss": 0.4963419437408447, + "gl_distil_loss": 0.008539569564163685, + "gl_dms_closed_frac": 0.8266037702560425, + "gl_dms_cr": 5.772175312042236, + "gl_dms_loss": 0.003481827676296234, + "gl_dms_target_cr": 5.852940559387207, + "gl_dms_target_frac": 0.8291457891464233, + "gl_eos_tokens": 10.125, + "gl_input_tokens": 32768.0, + "gl_loss": 0.012021397240459919, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.875, + "gl_positions_without_loss_calculation": 10.125, + "grad_norm": 5.81721830368042, + "learning_rate": 3e-05, + "loss": 0.012, + "step": 166 + }, + { + "epoch": 0.334, + "gl_detached_lm_loss": 0.47259899973869324, + "gl_distil_loss": 0.007361316587775946, + "gl_dms_closed_frac": 0.8257991075515747, + "gl_dms_cr": 5.7416534423828125, + "gl_dms_loss": 0.004307880997657776, + "gl_dms_target_cr": 5.882352828979492, + "gl_dms_target_frac": 0.8299999833106995, + "gl_eos_tokens": 10.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.011669197119772434, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32757.25, + "gl_positions_without_loss_calculation": 10.75, + "grad_norm": 10.337682723999023, + "learning_rate": 3e-05, + "loss": 0.0117, + "step": 167 + }, + { + "epoch": 0.336, + "gl_detached_lm_loss": 0.4550955891609192, + "gl_distil_loss": 0.0070419758558273315, + "gl_dms_closed_frac": 0.8298782110214233, + "gl_dms_cr": 5.879843711853027, + "gl_dms_loss": 0.0018124282360076904, + "gl_dms_target_cr": 5.911765098571777, + "gl_dms_target_frac": 0.8308457732200623, + "gl_eos_tokens": 12.25, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008854404091835022, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32755.75, + "gl_positions_without_loss_calculation": 12.25, + "grad_norm": 5.435131072998047, + "learning_rate": 3e-05, + "loss": 0.0089, + "step": 168 + }, + { + "epoch": 0.338, + "gl_detached_lm_loss": 0.4601227045059204, + "gl_distil_loss": 0.007701840251684189, + "gl_dms_closed_frac": 0.835907518863678, + "gl_dms_cr": 6.098821640014648, + "gl_dms_loss": 0.0005731061100959778, + "gl_dms_target_cr": 5.941176891326904, + "gl_dms_target_frac": 0.8316831588745117, + "gl_eos_tokens": 12.0, + "gl_input_tokens": 32768.0, + "gl_loss": 0.008274946361780167, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.0, + "gl_positions_without_loss_calculation": 12.0, + "grad_norm": 0.9658327698707581, + "learning_rate": 3e-05, + "loss": 0.0083, + "step": 169 + }, + { + "epoch": 0.34, + "gl_detached_lm_loss": 0.4400879442691803, + "gl_distil_loss": 0.0074328347109258175, + "gl_dms_closed_frac": 0.8415706753730774, + "gl_dms_cr": 6.313007354736328, + "gl_dms_loss": 0.0, + "gl_dms_target_cr": 5.970588207244873, + "gl_dms_target_frac": 0.8325123190879822, + "gl_eos_tokens": 11.75, + "gl_input_tokens": 32768.0, + "gl_loss": 0.0074328347109258175, + "gl_masked_tokens": 0.0, + "gl_positions_for_loss_calculation": 32756.25, + "gl_positions_without_loss_calculation": 11.75, + "grad_norm": 1.949934720993042, + "learning_rate": 3e-05, + "loss": 0.0074, + "step": 170 + } + ], + "logging_steps": 1, + "max_steps": 238, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 34, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}