{ "best_global_step": 2500, "best_metric": 20.97090721130371, "best_model_checkpoint": "./bert-kinyarwanda-continued/checkpoint-2500", "epoch": 3.0, "eval_steps": 500, "global_step": 2808, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10684474123539232, "grad_norm": 15.981427192687988, "learning_rate": 1.9955930958501655e-05, "loss": 625.3383984375, "step": 100 }, { "epoch": 0.21368948247078465, "grad_norm": 14.30290699005127, "learning_rate": 1.92214469335292e-05, "loss": 499.739765625, "step": 200 }, { "epoch": 0.32053422370617696, "grad_norm": 14.312140464782715, "learning_rate": 1.8486962908556742e-05, "loss": 456.5875, "step": 300 }, { "epoch": 0.4273789649415693, "grad_norm": 14.270943641662598, "learning_rate": 1.7752478883584282e-05, "loss": 433.20015625, "step": 400 }, { "epoch": 0.5342237061769616, "grad_norm": 14.424040794372559, "learning_rate": 1.7017994858611826e-05, "loss": 416.892109375, "step": 500 }, { "epoch": 0.5342237061769616, "eval_loss": 25.040687561035156, "eval_runtime": 39.3772, "eval_samples_per_second": 1280.92, "eval_steps_per_second": 40.049, "step": 500 }, { "epoch": 0.6410684474123539, "grad_norm": 13.946560859680176, "learning_rate": 1.628351083363937e-05, "loss": 405.925078125, "step": 600 }, { "epoch": 0.7479131886477463, "grad_norm": 13.409845352172852, "learning_rate": 1.5549026808666913e-05, "loss": 397.17734375, "step": 700 }, { "epoch": 0.8547579298831386, "grad_norm": 13.034708023071289, "learning_rate": 1.4814542783694456e-05, "loss": 389.632109375, "step": 800 }, { "epoch": 0.9616026711185309, "grad_norm": 12.836302757263184, "learning_rate": 1.4080058758722e-05, "loss": 383.981953125, "step": 900 }, { "epoch": 1.068380634390651, "grad_norm": 13.04681396484375, "learning_rate": 1.3345574733749541e-05, "loss": 378.46109375, "step": 1000 }, { "epoch": 1.068380634390651, "eval_loss": 22.749231338500977, "eval_runtime": 40.0531, "eval_samples_per_second": 1259.302, "eval_steps_per_second": 39.373, "step": 1000 }, { "epoch": 1.1752253756260433, "grad_norm": 13.020837783813477, "learning_rate": 1.2611090708777085e-05, "loss": 374.2799609375, "step": 1100 }, { "epoch": 1.2820701168614357, "grad_norm": 14.044160842895508, "learning_rate": 1.1876606683804628e-05, "loss": 370.35078125, "step": 1200 }, { "epoch": 1.388914858096828, "grad_norm": 14.57093334197998, "learning_rate": 1.114212265883217e-05, "loss": 367.606875, "step": 1300 }, { "epoch": 1.4957595993322204, "grad_norm": 13.08434009552002, "learning_rate": 1.0407638633859714e-05, "loss": 364.3687890625, "step": 1400 }, { "epoch": 1.6026043405676127, "grad_norm": 12.885830879211426, "learning_rate": 9.673154608887257e-06, "loss": 360.90921875, "step": 1500 }, { "epoch": 1.6026043405676127, "eval_loss": 21.860645294189453, "eval_runtime": 39.7103, "eval_samples_per_second": 1270.174, "eval_steps_per_second": 39.713, "step": 1500 }, { "epoch": 1.709449081803005, "grad_norm": 13.03475284576416, "learning_rate": 8.938670583914801e-06, "loss": 359.79640625, "step": 1600 }, { "epoch": 1.8162938230383974, "grad_norm": 12.590346336364746, "learning_rate": 8.204186558942344e-06, "loss": 357.7059375, "step": 1700 }, { "epoch": 1.9231385642737897, "grad_norm": 13.379103660583496, "learning_rate": 7.469702533969887e-06, "loss": 355.667265625, "step": 1800 }, { "epoch": 2.0299165275459097, "grad_norm": 13.557140350341797, "learning_rate": 6.73521850899743e-06, "loss": 353.45390625, "step": 1900 }, { "epoch": 2.136761268781302, "grad_norm": 12.653982162475586, "learning_rate": 6.000734484024972e-06, "loss": 351.9113671875, "step": 2000 }, { "epoch": 2.136761268781302, "eval_loss": 21.225444793701172, "eval_runtime": 39.4597, "eval_samples_per_second": 1278.239, "eval_steps_per_second": 39.965, "step": 2000 }, { "epoch": 2.2436060100166944, "grad_norm": 13.150154113769531, "learning_rate": 5.266250459052517e-06, "loss": 351.964921875, "step": 2100 }, { "epoch": 2.3504507512520867, "grad_norm": 13.27846908569336, "learning_rate": 4.531766434080059e-06, "loss": 349.540390625, "step": 2200 }, { "epoch": 2.457295492487479, "grad_norm": 13.423123359680176, "learning_rate": 3.797282409107602e-06, "loss": 348.7391796875, "step": 2300 }, { "epoch": 2.5641402337228714, "grad_norm": 12.963814735412598, "learning_rate": 3.0627983841351456e-06, "loss": 347.498046875, "step": 2400 }, { "epoch": 2.6709849749582637, "grad_norm": 12.780089378356934, "learning_rate": 2.3283143591626882e-06, "loss": 348.4215234375, "step": 2500 }, { "epoch": 2.6709849749582637, "eval_loss": 20.97090721130371, "eval_runtime": 39.9875, "eval_samples_per_second": 1261.369, "eval_steps_per_second": 39.437, "step": 2500 }, { "epoch": 2.777829716193656, "grad_norm": 12.934194564819336, "learning_rate": 1.5938303341902313e-06, "loss": 346.3137890625, "step": 2600 }, { "epoch": 2.8846744574290484, "grad_norm": 13.299077033996582, "learning_rate": 8.593463092177746e-07, "loss": 347.1732421875, "step": 2700 }, { "epoch": 2.9915191986644407, "grad_norm": 12.96524429321289, "learning_rate": 1.2486228424531768e-07, "loss": 346.15265625, "step": 2800 } ], "logging_steps": 100, "max_steps": 2808, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.70596511063212e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }