{ "best_metric": 0.07459307722662609, "best_model_checkpoint": "/bartabsa-reproduce/outputs/gpt22gpt2_42/checkpoint-20000", "epoch": 2.999832822513235, "eval_steps": 2000, "global_step": 26916, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05572582892170521, "grad_norm": 3.767733097076416, "learning_rate": 2.5e-05, "loss": 4.6099, "step": 500 }, { "epoch": 0.11145165784341042, "grad_norm": 2.3378305435180664, "learning_rate": 5e-05, "loss": 3.4791, "step": 1000 }, { "epoch": 0.16717748676511562, "grad_norm": 1.7727080583572388, "learning_rate": 4.9035344960642076e-05, "loss": 3.3408, "step": 1500 }, { "epoch": 0.22290331568682084, "grad_norm": 1.6580827236175537, "learning_rate": 4.807068992128415e-05, "loss": 3.2453, "step": 2000 }, { "epoch": 0.22290331568682084, "eval_loss": 3.040846347808838, "eval_rouge1": 0.2030762835922255, "eval_rouge2": 0.03654806097360143, "eval_rougeL": 0.1269024607851587, "eval_rougeLsum": 0.19064872923784548, "eval_runtime": 3935.594, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.212, "step": 2000 }, { "epoch": 0.27862914460852606, "grad_norm": 1.6749674081802368, "learning_rate": 4.7106034881926225e-05, "loss": 3.1966, "step": 2500 }, { "epoch": 0.33435497353023125, "grad_norm": 1.6479393243789673, "learning_rate": 4.61413798425683e-05, "loss": 3.1416, "step": 3000 }, { "epoch": 0.3900808024519365, "grad_norm": 1.6228386163711548, "learning_rate": 4.517672480321037e-05, "loss": 3.0931, "step": 3500 }, { "epoch": 0.4458066313736417, "grad_norm": 1.6669822931289673, "learning_rate": 4.421206976385245e-05, "loss": 3.0421, "step": 4000 }, { "epoch": 0.4458066313736417, "eval_loss": 2.8455963134765625, "eval_rouge1": 0.2380314893051126, "eval_rouge2": 0.05128827161353091, "eval_rougeL": 0.1453227983282736, "eval_rougeLsum": 0.22321551567650025, "eval_runtime": 3950.4694, "eval_samples_per_second": 3.384, "eval_steps_per_second": 0.212, "step": 4000 }, { "epoch": 0.5015324602953469, "grad_norm": 1.721129059791565, "learning_rate": 4.324741472449452e-05, "loss": 3.0031, "step": 4500 }, { "epoch": 0.5572582892170521, "grad_norm": 1.6570061445236206, "learning_rate": 4.2282759685136595e-05, "loss": 2.9799, "step": 5000 }, { "epoch": 0.6129841181387573, "grad_norm": 1.6984457969665527, "learning_rate": 4.131810464577867e-05, "loss": 2.9441, "step": 5500 }, { "epoch": 0.6687099470604625, "grad_norm": 1.780073642730713, "learning_rate": 4.035344960642074e-05, "loss": 2.9227, "step": 6000 }, { "epoch": 0.6687099470604625, "eval_loss": 2.728790044784546, "eval_rouge1": 0.2595394320658464, "eval_rouge2": 0.06168732304612659, "eval_rougeL": 0.15581664226279762, "eval_rougeLsum": 0.2423658467883625, "eval_runtime": 3924.3137, "eval_samples_per_second": 3.406, "eval_steps_per_second": 0.213, "step": 6000 }, { "epoch": 0.7244357759821677, "grad_norm": 1.7621432542800903, "learning_rate": 3.938879456706282e-05, "loss": 2.9003, "step": 6500 }, { "epoch": 0.780161604903873, "grad_norm": 1.6608766317367554, "learning_rate": 3.84241395277049e-05, "loss": 2.8805, "step": 7000 }, { "epoch": 0.8358874338255782, "grad_norm": 1.6685175895690918, "learning_rate": 3.745948448834697e-05, "loss": 2.8633, "step": 7500 }, { "epoch": 0.8916132627472834, "grad_norm": 1.766258955001831, "learning_rate": 3.6494829448989046e-05, "loss": 2.8436, "step": 8000 }, { "epoch": 0.8916132627472834, "eval_loss": 2.6555898189544678, "eval_rouge1": 0.2583863310766251, "eval_rouge2": 0.06324442312681633, "eval_rougeL": 0.1554957409851852, "eval_rougeLsum": 0.24164198724587968, "eval_runtime": 3936.4212, "eval_samples_per_second": 3.396, "eval_steps_per_second": 0.212, "step": 8000 }, { "epoch": 0.9473390916689886, "grad_norm": 1.59657621383667, "learning_rate": 3.553017440963112e-05, "loss": 2.8263, "step": 8500 }, { "epoch": 1.0030649205906939, "grad_norm": 1.5849162340164185, "learning_rate": 3.4565519370273194e-05, "loss": 2.8088, "step": 9000 }, { "epoch": 1.058790749512399, "grad_norm": 1.7484833002090454, "learning_rate": 3.360086433091527e-05, "loss": 2.7095, "step": 9500 }, { "epoch": 1.1145165784341042, "grad_norm": 1.5881661176681519, "learning_rate": 3.263620929155734e-05, "loss": 2.6961, "step": 10000 }, { "epoch": 1.1145165784341042, "eval_loss": 2.5992419719696045, "eval_rouge1": 0.25777068407797354, "eval_rouge2": 0.06420331632465279, "eval_rougeL": 0.15702078007420395, "eval_rougeLsum": 0.2409777539933322, "eval_runtime": 3924.4889, "eval_samples_per_second": 3.406, "eval_steps_per_second": 0.213, "step": 10000 }, { "epoch": 1.1702424073558095, "grad_norm": 1.5762600898742676, "learning_rate": 3.1671554252199416e-05, "loss": 2.6942, "step": 10500 }, { "epoch": 1.2259682362775146, "grad_norm": 1.630346655845642, "learning_rate": 3.070689921284149e-05, "loss": 2.688, "step": 11000 }, { "epoch": 1.28169406519922, "grad_norm": 1.6222407817840576, "learning_rate": 2.9742244173483564e-05, "loss": 2.6682, "step": 11500 }, { "epoch": 1.337419894120925, "grad_norm": 1.6392185688018799, "learning_rate": 2.8777589134125638e-05, "loss": 2.6662, "step": 12000 }, { "epoch": 1.337419894120925, "eval_loss": 2.551286458969116, "eval_rouge1": 0.27486750620247946, "eval_rouge2": 0.07168156814787813, "eval_rougeL": 0.1642252769198796, "eval_rougeLsum": 0.25714973714244077, "eval_runtime": 3900.0246, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.214, "step": 12000 }, { "epoch": 1.3931457230426303, "grad_norm": 1.5941892862319946, "learning_rate": 2.7812934094767712e-05, "loss": 2.6653, "step": 12500 }, { "epoch": 1.4488715519643356, "grad_norm": 1.6253877878189087, "learning_rate": 2.6848279055409786e-05, "loss": 2.6586, "step": 13000 }, { "epoch": 1.5045973808860407, "grad_norm": 1.6063872575759888, "learning_rate": 2.588362401605186e-05, "loss": 2.6443, "step": 13500 }, { "epoch": 1.5603232098077457, "grad_norm": 1.713887095451355, "learning_rate": 2.4918968976693934e-05, "loss": 2.6312, "step": 14000 }, { "epoch": 1.5603232098077457, "eval_loss": 2.508091688156128, "eval_rouge1": 0.2530435850081141, "eval_rouge2": 0.06384506041315902, "eval_rougeL": 0.1543038576804575, "eval_rougeLsum": 0.2366360294033542, "eval_runtime": 3907.0776, "eval_samples_per_second": 3.421, "eval_steps_per_second": 0.214, "step": 14000 }, { "epoch": 1.616049038729451, "grad_norm": 1.8332961797714233, "learning_rate": 2.3954313937336008e-05, "loss": 2.63, "step": 14500 }, { "epoch": 1.6717748676511563, "grad_norm": 1.6101057529449463, "learning_rate": 2.2989658897978082e-05, "loss": 2.6114, "step": 15000 }, { "epoch": 1.7275006965728616, "grad_norm": 1.7273740768432617, "learning_rate": 2.2025003858620156e-05, "loss": 2.6051, "step": 15500 }, { "epoch": 1.7832265254945667, "grad_norm": 1.7632737159729004, "learning_rate": 2.1060348819262234e-05, "loss": 2.6058, "step": 16000 }, { "epoch": 1.7832265254945667, "eval_loss": 2.463944911956787, "eval_rouge1": 0.2636323697106167, "eval_rouge2": 0.07174514437983107, "eval_rougeL": 0.1601389399578005, "eval_rougeLsum": 0.2469347915587097, "eval_runtime": 3915.9785, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.213, "step": 16000 }, { "epoch": 1.8389523544162718, "grad_norm": 1.8221988677978516, "learning_rate": 2.0095693779904308e-05, "loss": 2.5988, "step": 16500 }, { "epoch": 1.894678183337977, "grad_norm": 1.8893871307373047, "learning_rate": 1.9131038740546382e-05, "loss": 2.5847, "step": 17000 }, { "epoch": 1.9504040122596824, "grad_norm": 1.953140139579773, "learning_rate": 1.8166383701188456e-05, "loss": 2.5804, "step": 17500 }, { "epoch": 2.0061298411813877, "grad_norm": 1.8473776578903198, "learning_rate": 1.720172866183053e-05, "loss": 2.5725, "step": 18000 }, { "epoch": 2.0061298411813877, "eval_loss": 2.4292125701904297, "eval_rouge1": 0.2567421048616416, "eval_rouge2": 0.06891060288535017, "eval_rougeL": 0.15595194613787078, "eval_rougeLsum": 0.24070474254739155, "eval_runtime": 3869.9616, "eval_samples_per_second": 3.454, "eval_steps_per_second": 0.216, "step": 18000 }, { "epoch": 2.0618556701030926, "grad_norm": 1.7851742506027222, "learning_rate": 1.6237073622472604e-05, "loss": 2.489, "step": 18500 }, { "epoch": 2.117581499024798, "grad_norm": 2.2768101692199707, "learning_rate": 1.5272418583114678e-05, "loss": 2.4861, "step": 19000 }, { "epoch": 2.173307327946503, "grad_norm": 2.209219455718994, "learning_rate": 1.4307763543756752e-05, "loss": 2.4912, "step": 19500 }, { "epoch": 2.2290331568682085, "grad_norm": 2.0397818088531494, "learning_rate": 1.3343108504398828e-05, "loss": 2.4892, "step": 20000 }, { "epoch": 2.2290331568682085, "eval_loss": 2.4027278423309326, "eval_rouge1": 0.2706722599374948, "eval_rouge2": 0.07459307722662609, "eval_rougeL": 0.16398173707926839, "eval_rougeLsum": 0.2530942151578608, "eval_runtime": 3893.765, "eval_samples_per_second": 3.433, "eval_steps_per_second": 0.215, "step": 20000 }, { "epoch": 2.2847589857899138, "grad_norm": 2.035895824432373, "learning_rate": 1.2378453465040902e-05, "loss": 2.4728, "step": 20500 }, { "epoch": 2.340484814711619, "grad_norm": 2.106766939163208, "learning_rate": 1.1413798425682977e-05, "loss": 2.4768, "step": 21000 }, { "epoch": 2.396210643633324, "grad_norm": 2.103576183319092, "learning_rate": 1.0449143386325052e-05, "loss": 2.4689, "step": 21500 }, { "epoch": 2.4519364725550292, "grad_norm": 2.0902152061462402, "learning_rate": 9.484488346967126e-06, "loss": 2.4647, "step": 22000 }, { "epoch": 2.4519364725550292, "eval_loss": 2.3800978660583496, "eval_rouge1": 0.25082265645038193, "eval_rouge2": 0.06640380147549775, "eval_rougeL": 0.1539963772798671, "eval_rougeLsum": 0.23498739580707717, "eval_runtime": 3884.9364, "eval_samples_per_second": 3.441, "eval_steps_per_second": 0.215, "step": 22000 }, { "epoch": 2.5076623014767345, "grad_norm": 1.8595211505889893, "learning_rate": 8.5198333076092e-06, "loss": 2.4541, "step": 22500 }, { "epoch": 2.56338813039844, "grad_norm": 2.1612913608551025, "learning_rate": 7.5551782682512745e-06, "loss": 2.4519, "step": 23000 }, { "epoch": 2.6191139593201447, "grad_norm": 2.2538599967956543, "learning_rate": 6.5905232288933485e-06, "loss": 2.4544, "step": 23500 }, { "epoch": 2.67483978824185, "grad_norm": 2.060137987136841, "learning_rate": 5.6258681895354226e-06, "loss": 2.4479, "step": 24000 }, { "epoch": 2.67483978824185, "eval_loss": 2.361970901489258, "eval_rouge1": 0.263764877338478, "eval_rouge2": 0.07266606751022181, "eval_rougeL": 0.1608073972426968, "eval_rougeLsum": 0.24731330981409283, "eval_runtime": 3893.8374, "eval_samples_per_second": 3.433, "eval_steps_per_second": 0.215, "step": 24000 }, { "epoch": 2.7305656171635553, "grad_norm": 1.9665076732635498, "learning_rate": 4.661213150177497e-06, "loss": 2.4587, "step": 24500 }, { "epoch": 2.7862914460852606, "grad_norm": 2.218065023422241, "learning_rate": 3.6965581108195706e-06, "loss": 2.4458, "step": 25000 }, { "epoch": 2.842017275006966, "grad_norm": 2.042405605316162, "learning_rate": 2.7319030714616455e-06, "loss": 2.4513, "step": 25500 }, { "epoch": 2.897743103928671, "grad_norm": 1.9766805171966553, "learning_rate": 1.7672480321037198e-06, "loss": 2.4474, "step": 26000 }, { "epoch": 2.897743103928671, "eval_loss": 2.3526828289031982, "eval_rouge1": 0.2544085209463849, "eval_rouge2": 0.06856415444008992, "eval_rougeL": 0.15601198971765073, "eval_rougeLsum": 0.23845835139467592, "eval_runtime": 3886.9121, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.215, "step": 26000 }, { "epoch": 2.953468932850376, "grad_norm": 2.4569876194000244, "learning_rate": 8.025929927457941e-07, "loss": 2.4411, "step": 26500 }, { "epoch": 2.999832822513235, "step": 26916, "total_flos": 5.251566637814907e+17, "train_loss": 2.7409434880486105, "train_runtime": 62948.108, "train_samples_per_second": 13.683, "train_steps_per_second": 0.428 } ], "logging_steps": 500, "max_steps": 26916, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.251566637814907e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }