Instructions to use ishathombre/monolingual-hindi-from-scratch with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ishathombre/monolingual-hindi-from-scratch with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="ishathombre/monolingual-hindi-from-scratch")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ishathombre/monolingual-hindi-from-scratch")
model = AutoModelForMaskedLM.from_pretrained("ishathombre/monolingual-hindi-from-scratch")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 15000, | |
| "global_step": 140625, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0035555555555555557, | |
| "eval_loss": 10.192023277282715, | |
| "eval_runtime": 3606.549, | |
| "eval_samples_per_second": 138.637, | |
| "eval_steps_per_second": 4.332, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0071111111111111115, | |
| "grad_norm": 2.387178421020508, | |
| "learning_rate": 7.103242320819113e-06, | |
| "loss": 10.1288, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0071111111111111115, | |
| "eval_loss": 9.418336868286133, | |
| "eval_runtime": 3608.5139, | |
| "eval_samples_per_second": 138.561, | |
| "eval_steps_per_second": 4.33, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.010666666666666666, | |
| "eval_loss": 8.553510665893555, | |
| "eval_runtime": 3626.6638, | |
| "eval_samples_per_second": 137.868, | |
| "eval_steps_per_second": 4.308, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.014222222222222223, | |
| "grad_norm": 1.6770904064178467, | |
| "learning_rate": 1.4213594994311718e-05, | |
| "loss": 8.5935, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.014222222222222223, | |
| "eval_loss": 7.8756632804870605, | |
| "eval_runtime": 3629.9225, | |
| "eval_samples_per_second": 137.744, | |
| "eval_steps_per_second": 4.304, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.017777777777777778, | |
| "eval_loss": 7.625248432159424, | |
| "eval_runtime": 3637.2635, | |
| "eval_samples_per_second": 137.466, | |
| "eval_steps_per_second": 4.296, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 1.6508774757385254, | |
| "learning_rate": 2.1323947667804326e-05, | |
| "loss": 7.6525, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.021333333333333333, | |
| "eval_loss": 7.534073829650879, | |
| "eval_runtime": 3654.9087, | |
| "eval_samples_per_second": 136.802, | |
| "eval_steps_per_second": 4.275, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.024888888888888887, | |
| "eval_loss": 7.45560884475708, | |
| "eval_runtime": 3671.4513, | |
| "eval_samples_per_second": 136.186, | |
| "eval_steps_per_second": 4.256, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.028444444444444446, | |
| "grad_norm": 2.0226364135742188, | |
| "learning_rate": 2.8434300341296933e-05, | |
| "loss": 7.4767, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.028444444444444446, | |
| "eval_loss": 7.391005992889404, | |
| "eval_runtime": 3680.5941, | |
| "eval_samples_per_second": 135.848, | |
| "eval_steps_per_second": 4.245, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 7.328425407409668, | |
| "eval_runtime": 3685.0206, | |
| "eval_samples_per_second": 135.684, | |
| "eval_steps_per_second": 4.24, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.035555555555555556, | |
| "grad_norm": 1.980565071105957, | |
| "learning_rate": 3.554465301478954e-05, | |
| "loss": 7.3497, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.035555555555555556, | |
| "eval_loss": 7.287872314453125, | |
| "eval_runtime": 3698.5108, | |
| "eval_samples_per_second": 135.19, | |
| "eval_steps_per_second": 4.225, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.03911111111111111, | |
| "eval_loss": 7.240482807159424, | |
| "eval_runtime": 3709.8586, | |
| "eval_samples_per_second": 134.776, | |
| "eval_steps_per_second": 4.212, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 2.800004720687866, | |
| "learning_rate": 4.265500568828214e-05, | |
| "loss": 7.2465, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.042666666666666665, | |
| "eval_loss": 7.197748184204102, | |
| "eval_runtime": 3735.3694, | |
| "eval_samples_per_second": 133.856, | |
| "eval_steps_per_second": 4.183, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.04622222222222222, | |
| "eval_loss": 7.154662132263184, | |
| "eval_runtime": 3742.7005, | |
| "eval_samples_per_second": 133.593, | |
| "eval_steps_per_second": 4.175, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.049777777777777775, | |
| "grad_norm": 3.009158134460449, | |
| "learning_rate": 4.9765358361774746e-05, | |
| "loss": 7.1859, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.049777777777777775, | |
| "eval_loss": 7.120786190032959, | |
| "eval_runtime": 3747.8473, | |
| "eval_samples_per_second": 133.41, | |
| "eval_steps_per_second": 4.169, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.05333333333333334, | |
| "eval_loss": 7.0815510749816895, | |
| "eval_runtime": 3748.6541, | |
| "eval_samples_per_second": 133.381, | |
| "eval_steps_per_second": 4.168, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.05688888888888889, | |
| "grad_norm": 3.3026907444000244, | |
| "learning_rate": 4.963807983951255e-05, | |
| "loss": 7.1209, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.05688888888888889, | |
| "eval_loss": 7.053798675537109, | |
| "eval_runtime": 3748.7994, | |
| "eval_samples_per_second": 133.376, | |
| "eval_steps_per_second": 4.168, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.060444444444444446, | |
| "eval_loss": 7.023181438446045, | |
| "eval_runtime": 3634.6666, | |
| "eval_samples_per_second": 137.564, | |
| "eval_steps_per_second": 4.299, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 3.314805030822754, | |
| "learning_rate": 4.926380873249347e-05, | |
| "loss": 7.0348, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.07111111111111111, | |
| "grad_norm": 2.875911235809326, | |
| "learning_rate": 4.888953762547439e-05, | |
| "loss": 7.0072, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.07111111111111111, | |
| "eval_loss": 6.941441535949707, | |
| "eval_runtime": 3748.7055, | |
| "eval_samples_per_second": 133.379, | |
| "eval_steps_per_second": 4.168, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.07822222222222222, | |
| "grad_norm": 2.914552688598633, | |
| "learning_rate": 4.851526651845531e-05, | |
| "loss": 6.947, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 2.964020252227783, | |
| "learning_rate": 4.814099541143623e-05, | |
| "loss": 6.8917, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.08533333333333333, | |
| "eval_loss": 6.830844402313232, | |
| "eval_runtime": 3577.7662, | |
| "eval_samples_per_second": 139.752, | |
| "eval_steps_per_second": 4.367, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.09244444444444444, | |
| "grad_norm": 3.0612735748291016, | |
| "learning_rate": 4.7766724304417146e-05, | |
| "loss": 6.8343, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.09955555555555555, | |
| "grad_norm": 2.7895731925964355, | |
| "learning_rate": 4.739245319739807e-05, | |
| "loss": 6.8006, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.09955555555555555, | |
| "eval_loss": 6.7523064613342285, | |
| "eval_runtime": 3593.4585, | |
| "eval_samples_per_second": 139.142, | |
| "eval_steps_per_second": 4.348, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 4.3099260330200195, | |
| "learning_rate": 4.7018182090378985e-05, | |
| "loss": 6.7552, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.11377777777777778, | |
| "grad_norm": 4.26462459564209, | |
| "learning_rate": 4.664391098335991e-05, | |
| "loss": 6.716, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.11377777777777778, | |
| "eval_loss": 6.662391185760498, | |
| "eval_runtime": 3610.0752, | |
| "eval_samples_per_second": 138.501, | |
| "eval_steps_per_second": 4.328, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.12088888888888889, | |
| "grad_norm": 4.591431140899658, | |
| "learning_rate": 4.626963987634083e-05, | |
| "loss": 6.6886, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 3.8068220615386963, | |
| "learning_rate": 4.589536876932175e-05, | |
| "loss": 6.6432, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "eval_loss": 6.585519313812256, | |
| "eval_runtime": 3618.075, | |
| "eval_samples_per_second": 138.195, | |
| "eval_steps_per_second": 4.319, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.1351111111111111, | |
| "grad_norm": 4.0343523025512695, | |
| "learning_rate": 4.552109766230267e-05, | |
| "loss": 6.6323, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.14222222222222222, | |
| "grad_norm": 4.53961181640625, | |
| "learning_rate": 4.5146826555283586e-05, | |
| "loss": 6.5772, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.14222222222222222, | |
| "eval_loss": 6.517657279968262, | |
| "eval_runtime": 3623.0508, | |
| "eval_samples_per_second": 138.005, | |
| "eval_steps_per_second": 4.313, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 4.860204219818115, | |
| "learning_rate": 4.477255544826451e-05, | |
| "loss": 6.5456, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.15644444444444444, | |
| "grad_norm": 4.892744541168213, | |
| "learning_rate": 4.4398284341245425e-05, | |
| "loss": 6.4887, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.15644444444444444, | |
| "eval_loss": 6.45619010925293, | |
| "eval_runtime": 3632.2999, | |
| "eval_samples_per_second": 137.654, | |
| "eval_steps_per_second": 4.302, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.16355555555555557, | |
| "grad_norm": 4.8081135749816895, | |
| "learning_rate": 4.402401323422635e-05, | |
| "loss": 6.4871, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 6.334560871124268, | |
| "learning_rate": 4.3649742127207264e-05, | |
| "loss": 6.4517, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.17066666666666666, | |
| "eval_loss": 6.385794639587402, | |
| "eval_runtime": 3643.4203, | |
| "eval_samples_per_second": 137.234, | |
| "eval_steps_per_second": 4.289, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 6.487077713012695, | |
| "learning_rate": 4.3275471020188187e-05, | |
| "loss": 6.4336, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.18488888888888888, | |
| "grad_norm": 6.764550685882568, | |
| "learning_rate": 4.29011999131691e-05, | |
| "loss": 6.3861, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.18488888888888888, | |
| "eval_loss": 6.320189476013184, | |
| "eval_runtime": 3651.5073, | |
| "eval_samples_per_second": 136.93, | |
| "eval_steps_per_second": 4.279, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 6.320560932159424, | |
| "learning_rate": 4.2526928806150026e-05, | |
| "loss": 6.3545, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.1991111111111111, | |
| "grad_norm": 7.219266414642334, | |
| "learning_rate": 4.215265769913094e-05, | |
| "loss": 6.3368, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.1991111111111111, | |
| "eval_loss": 6.245439052581787, | |
| "eval_runtime": 3691.7383, | |
| "eval_samples_per_second": 135.438, | |
| "eval_steps_per_second": 4.232, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.20622222222222222, | |
| "grad_norm": 7.970364570617676, | |
| "learning_rate": 4.1778386592111864e-05, | |
| "loss": 6.2752, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 4.8507232666015625, | |
| "learning_rate": 4.140411548509278e-05, | |
| "loss": 6.2344, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "eval_loss": 6.119101524353027, | |
| "eval_runtime": 3746.4316, | |
| "eval_samples_per_second": 133.46, | |
| "eval_steps_per_second": 4.171, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.22044444444444444, | |
| "grad_norm": 4.850826740264893, | |
| "learning_rate": 4.10298443780737e-05, | |
| "loss": 6.1825, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.22755555555555557, | |
| "grad_norm": 6.614843368530273, | |
| "learning_rate": 4.0655573271054626e-05, | |
| "loss": 6.1117, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.22755555555555557, | |
| "eval_loss": 5.977404594421387, | |
| "eval_runtime": 3748.1526, | |
| "eval_samples_per_second": 133.399, | |
| "eval_steps_per_second": 4.169, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 7.195309162139893, | |
| "learning_rate": 4.028130216403554e-05, | |
| "loss": 6.0633, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.24177777777777779, | |
| "grad_norm": 6.1979522705078125, | |
| "learning_rate": 3.9907031057016465e-05, | |
| "loss": 5.9906, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.24177777777777779, | |
| "eval_loss": 5.844593524932861, | |
| "eval_runtime": 3748.7753, | |
| "eval_samples_per_second": 133.377, | |
| "eval_steps_per_second": 4.168, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.24888888888888888, | |
| "grad_norm": 5.907817840576172, | |
| "learning_rate": 3.953275994999738e-05, | |
| "loss": 5.9681, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 10.167774200439453, | |
| "learning_rate": 3.9158488842978304e-05, | |
| "loss": 5.8877, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.26311111111111113, | |
| "grad_norm": 6.226233005523682, | |
| "learning_rate": 3.878421773595922e-05, | |
| "loss": 5.8555, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.2702222222222222, | |
| "grad_norm": 8.957833290100098, | |
| "learning_rate": 3.840994662894014e-05, | |
| "loss": 5.7746, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 5.283188819885254, | |
| "learning_rate": 3.803567552192106e-05, | |
| "loss": 5.7368, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.28444444444444444, | |
| "grad_norm": 8.641979217529297, | |
| "learning_rate": 3.766140441490198e-05, | |
| "loss": 5.684, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.28444444444444444, | |
| "eval_loss": 5.490408420562744, | |
| "eval_runtime": 3560.4593, | |
| "eval_samples_per_second": 140.431, | |
| "eval_steps_per_second": 4.388, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.29155555555555557, | |
| "grad_norm": 7.102558135986328, | |
| "learning_rate": 3.72871333078829e-05, | |
| "loss": 5.6058, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 10.386979103088379, | |
| "learning_rate": 3.691286220086382e-05, | |
| "loss": 5.5434, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.30577777777777776, | |
| "grad_norm": 8.64822006225586, | |
| "learning_rate": 3.653859109384474e-05, | |
| "loss": 5.5001, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.3128888888888889, | |
| "grad_norm": 5.785928249359131, | |
| "learning_rate": 3.616431998682566e-05, | |
| "loss": 5.4771, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.5996294021606445, | |
| "learning_rate": 3.5790048879806576e-05, | |
| "loss": 5.4112, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 5.232267379760742, | |
| "eval_runtime": 3637.4962, | |
| "eval_samples_per_second": 137.457, | |
| "eval_steps_per_second": 4.296, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.32711111111111113, | |
| "grad_norm": 6.475677967071533, | |
| "learning_rate": 3.54157777727875e-05, | |
| "loss": 5.3516, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.3342222222222222, | |
| "grad_norm": 8.867932319641113, | |
| "learning_rate": 3.504150666576842e-05, | |
| "loss": 5.3162, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 6.978850364685059, | |
| "learning_rate": 3.466723555874934e-05, | |
| "loss": 5.2946, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.34844444444444445, | |
| "grad_norm": 8.528263092041016, | |
| "learning_rate": 3.429296445173026e-05, | |
| "loss": 5.263, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 8.980386734008789, | |
| "learning_rate": 3.3919067615818196e-05, | |
| "loss": 5.2126, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 6.648774147033691, | |
| "learning_rate": 3.354479650879911e-05, | |
| "loss": 5.1586, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.36977777777777776, | |
| "grad_norm": 7.404205322265625, | |
| "learning_rate": 3.3170899672887054e-05, | |
| "loss": 5.1437, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.3768888888888889, | |
| "grad_norm": 8.521736145019531, | |
| "learning_rate": 3.279662856586798e-05, | |
| "loss": 5.1017, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 8.535130500793457, | |
| "learning_rate": 3.242273172995591e-05, | |
| "loss": 5.0448, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.39111111111111113, | |
| "grad_norm": 8.199433326721191, | |
| "learning_rate": 3.204846062293683e-05, | |
| "loss": 5.0525, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.3982222222222222, | |
| "grad_norm": 6.0434651374816895, | |
| "learning_rate": 3.167456378702477e-05, | |
| "loss": 5.028, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 9.137014389038086, | |
| "learning_rate": 3.130029268000569e-05, | |
| "loss": 4.9989, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.41244444444444445, | |
| "grad_norm": 8.472779273986816, | |
| "learning_rate": 3.092639584409363e-05, | |
| "loss": 4.9428, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.41955555555555557, | |
| "grad_norm": 8.970288276672363, | |
| "learning_rate": 3.055212473707455e-05, | |
| "loss": 4.9541, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 8.8677339553833, | |
| "learning_rate": 3.017785363005547e-05, | |
| "loss": 4.9165, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "eval_loss": 4.728287220001221, | |
| "eval_runtime": 1065.1935, | |
| "eval_samples_per_second": 469.398, | |
| "eval_steps_per_second": 14.669, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.43377777777777776, | |
| "grad_norm": 7.122616767883301, | |
| "learning_rate": 2.980358252303639e-05, | |
| "loss": 4.8869, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.4408888888888889, | |
| "grad_norm": 8.567378044128418, | |
| "learning_rate": 2.9429685687124324e-05, | |
| "loss": 4.87, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 6.809515953063965, | |
| "learning_rate": 2.9055788851212266e-05, | |
| "loss": 4.8423, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.45511111111111113, | |
| "grad_norm": 8.371034622192383, | |
| "learning_rate": 2.8681517744193186e-05, | |
| "loss": 4.8137, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.4622222222222222, | |
| "grad_norm": 11.026784896850586, | |
| "learning_rate": 2.8307246637174105e-05, | |
| "loss": 4.7887, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 7.04138708114624, | |
| "learning_rate": 2.7933349801262044e-05, | |
| "loss": 4.7642, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.47644444444444445, | |
| "grad_norm": 6.654124736785889, | |
| "learning_rate": 2.7559078694242964e-05, | |
| "loss": 4.7682, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.48355555555555557, | |
| "grad_norm": 9.198745727539062, | |
| "learning_rate": 2.718480758722388e-05, | |
| "loss": 4.7505, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 7.339107036590576, | |
| "learning_rate": 2.68105364802048e-05, | |
| "loss": 4.695, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.49777777777777776, | |
| "grad_norm": 7.021571636199951, | |
| "learning_rate": 2.6436639644292742e-05, | |
| "loss": 4.6952, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.5048888888888889, | |
| "grad_norm": 6.531971454620361, | |
| "learning_rate": 2.606236853727366e-05, | |
| "loss": 4.7041, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 8.730067253112793, | |
| "learning_rate": 2.56884717013616e-05, | |
| "loss": 4.6792, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.5191111111111111, | |
| "grad_norm": 6.701645374298096, | |
| "learning_rate": 2.531420059434252e-05, | |
| "loss": 4.6603, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.5262222222222223, | |
| "grad_norm": 8.028661727905273, | |
| "learning_rate": 2.493992948732344e-05, | |
| "loss": 4.6216, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 8.593280792236328, | |
| "learning_rate": 2.456565838030436e-05, | |
| "loss": 4.5962, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "eval_loss": 4.437507152557373, | |
| "eval_runtime": 1115.0836, | |
| "eval_samples_per_second": 448.397, | |
| "eval_steps_per_second": 14.012, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.5404444444444444, | |
| "grad_norm": 7.844559192657471, | |
| "learning_rate": 2.4191761544392298e-05, | |
| "loss": 4.5922, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.5475555555555556, | |
| "grad_norm": 8.809325218200684, | |
| "learning_rate": 2.3817490437373217e-05, | |
| "loss": 4.5741, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 8.746879577636719, | |
| "learning_rate": 2.3443593601461156e-05, | |
| "loss": 4.5682, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.5617777777777778, | |
| "grad_norm": 6.95306921005249, | |
| "learning_rate": 2.3069322494442076e-05, | |
| "loss": 4.5368, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.5688888888888889, | |
| "grad_norm": 6.743412494659424, | |
| "learning_rate": 2.2695425658530015e-05, | |
| "loss": 4.533, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 7.390259742736816, | |
| "learning_rate": 2.2321154551510934e-05, | |
| "loss": 4.4858, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.5831111111111111, | |
| "grad_norm": 6.666167736053467, | |
| "learning_rate": 2.1947257715598873e-05, | |
| "loss": 4.4989, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.5902222222222222, | |
| "grad_norm": 11.42460823059082, | |
| "learning_rate": 2.1572986608579793e-05, | |
| "loss": 4.4954, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 8.539606094360352, | |
| "learning_rate": 2.1198715501560712e-05, | |
| "loss": 4.4719, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.6044444444444445, | |
| "grad_norm": 7.553934097290039, | |
| "learning_rate": 2.082481866564865e-05, | |
| "loss": 4.4676, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.6115555555555555, | |
| "grad_norm": 6.772820949554443, | |
| "learning_rate": 2.045054755862957e-05, | |
| "loss": 4.4472, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 7.853897571563721, | |
| "learning_rate": 2.0076650722717507e-05, | |
| "loss": 4.4265, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.6257777777777778, | |
| "grad_norm": 9.26160717010498, | |
| "learning_rate": 1.9702379615698426e-05, | |
| "loss": 4.4468, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.6328888888888888, | |
| "grad_norm": 7.147493362426758, | |
| "learning_rate": 1.932810850867935e-05, | |
| "loss": 4.4117, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 7.0213446617126465, | |
| "learning_rate": 1.8954211672767285e-05, | |
| "loss": 4.4131, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 4.241979122161865, | |
| "eval_runtime": 1150.9235, | |
| "eval_samples_per_second": 434.434, | |
| "eval_steps_per_second": 13.576, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.6471111111111111, | |
| "grad_norm": 8.582975387573242, | |
| "learning_rate": 1.8580314836855227e-05, | |
| "loss": 4.4139, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.6542222222222223, | |
| "grad_norm": 8.507649421691895, | |
| "learning_rate": 1.8206043729836143e-05, | |
| "loss": 4.3542, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 6.707493305206299, | |
| "learning_rate": 1.7832146893924085e-05, | |
| "loss": 4.3797, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.6684444444444444, | |
| "grad_norm": 7.590912818908691, | |
| "learning_rate": 1.7457875786905005e-05, | |
| "loss": 4.3638, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.6755555555555556, | |
| "grad_norm": 10.533121109008789, | |
| "learning_rate": 1.7083604679885924e-05, | |
| "loss": 4.3663, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 9.008646965026855, | |
| "learning_rate": 1.670933357286684e-05, | |
| "loss": 4.3485, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.6897777777777778, | |
| "grad_norm": 5.894834518432617, | |
| "learning_rate": 1.6335436736954783e-05, | |
| "loss": 4.3309, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.6968888888888889, | |
| "grad_norm": 8.39154052734375, | |
| "learning_rate": 1.5961165629935702e-05, | |
| "loss": 4.3287, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 8.664682388305664, | |
| "learning_rate": 1.558726879402364e-05, | |
| "loss": 4.3453, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 6.261595249176025, | |
| "learning_rate": 1.521299768700456e-05, | |
| "loss": 4.3248, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.7182222222222222, | |
| "grad_norm": 6.0963006019592285, | |
| "learning_rate": 1.4839100851092497e-05, | |
| "loss": 4.3157, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 8.00066089630127, | |
| "learning_rate": 1.446482974407342e-05, | |
| "loss": 4.2899, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 0.7324444444444445, | |
| "grad_norm": 7.378486156463623, | |
| "learning_rate": 1.4090932908161355e-05, | |
| "loss": 4.2866, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 0.7395555555555555, | |
| "grad_norm": 7.146476745605469, | |
| "learning_rate": 1.3716661801142275e-05, | |
| "loss": 4.2811, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 6.750655174255371, | |
| "learning_rate": 1.3342764965230214e-05, | |
| "loss": 4.286, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "eval_loss": 4.117337703704834, | |
| "eval_runtime": 1173.3243, | |
| "eval_samples_per_second": 426.14, | |
| "eval_steps_per_second": 13.317, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 0.7537777777777778, | |
| "grad_norm": 8.043438911437988, | |
| "learning_rate": 1.2968493858211133e-05, | |
| "loss": 4.2712, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 0.7608888888888888, | |
| "grad_norm": 6.856295585632324, | |
| "learning_rate": 1.2594597022299074e-05, | |
| "loss": 4.269, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 7.801486968994141, | |
| "learning_rate": 1.2220325915279993e-05, | |
| "loss": 4.2669, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 0.7751111111111111, | |
| "grad_norm": 6.378691673278809, | |
| "learning_rate": 1.184642907936793e-05, | |
| "loss": 4.2745, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 0.7822222222222223, | |
| "grad_norm": 6.500033855438232, | |
| "learning_rate": 1.1472157972348852e-05, | |
| "loss": 4.2327, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 7.333186626434326, | |
| "learning_rate": 1.109826113643679e-05, | |
| "loss": 4.2546, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 0.7964444444444444, | |
| "grad_norm": 6.290594577789307, | |
| "learning_rate": 1.0723990029417709e-05, | |
| "loss": 4.211, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 0.8035555555555556, | |
| "grad_norm": 7.892796993255615, | |
| "learning_rate": 1.035009319350565e-05, | |
| "loss": 4.2283, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 12.832432746887207, | |
| "learning_rate": 9.975822086486567e-06, | |
| "loss": 4.2151, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 0.8177777777777778, | |
| "grad_norm": 7.7401885986328125, | |
| "learning_rate": 9.601925250574506e-06, | |
| "loss": 4.2408, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 0.8248888888888889, | |
| "grad_norm": 7.005733489990234, | |
| "learning_rate": 9.227654143555427e-06, | |
| "loss": 4.2157, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 6.389645576477051, | |
| "learning_rate": 8.853383036536345e-06, | |
| "loss": 4.1953, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 0.8391111111111111, | |
| "grad_norm": 10.353365898132324, | |
| "learning_rate": 8.479860471731304e-06, | |
| "loss": 4.1834, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 0.8462222222222222, | |
| "grad_norm": 7.046708106994629, | |
| "learning_rate": 8.105589364712223e-06, | |
| "loss": 4.1948, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 9.073925018310547, | |
| "learning_rate": 7.731318257693143e-06, | |
| "loss": 4.2043, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "eval_loss": 4.04020881652832, | |
| "eval_runtime": 1223.3756, | |
| "eval_samples_per_second": 408.705, | |
| "eval_steps_per_second": 12.772, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.8604444444444445, | |
| "grad_norm": 8.47717571258545, | |
| "learning_rate": 7.357421421781081e-06, | |
| "loss": 4.1805, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 0.8675555555555555, | |
| "grad_norm": 7.262242794036865, | |
| "learning_rate": 6.983150314762002e-06, | |
| "loss": 4.1904, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 7.6930766105651855, | |
| "learning_rate": 6.608879207742922e-06, | |
| "loss": 4.1905, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 0.8817777777777778, | |
| "grad_norm": 8.956729888916016, | |
| "learning_rate": 6.23460810072384e-06, | |
| "loss": 4.1768, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 8.849599838256836, | |
| "learning_rate": 5.860711264811779e-06, | |
| "loss": 4.1684, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 7.687771320343018, | |
| "learning_rate": 5.486440157792699e-06, | |
| "loss": 4.1535, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 0.9031111111111111, | |
| "grad_norm": 7.867640495300293, | |
| "learning_rate": 5.112543321880638e-06, | |
| "loss": 4.1821, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 0.9102222222222223, | |
| "grad_norm": 6.583714008331299, | |
| "learning_rate": 4.738272214861557e-06, | |
| "loss": 4.1653, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 6.076399326324463, | |
| "learning_rate": 4.3643753789494955e-06, | |
| "loss": 4.1669, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 0.9244444444444444, | |
| "grad_norm": 7.333767890930176, | |
| "learning_rate": 3.990104271930416e-06, | |
| "loss": 4.1656, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 0.9315555555555556, | |
| "grad_norm": 7.766304969787598, | |
| "learning_rate": 3.616207436018355e-06, | |
| "loss": 4.1568, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 7.178610324859619, | |
| "learning_rate": 3.241936328999274e-06, | |
| "loss": 4.1382, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 0.9457777777777778, | |
| "grad_norm": 7.358198165893555, | |
| "learning_rate": 2.868039493087213e-06, | |
| "loss": 4.1526, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 0.9528888888888889, | |
| "grad_norm": 6.876594066619873, | |
| "learning_rate": 2.4937683860681324e-06, | |
| "loss": 4.1653, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 8.42979621887207, | |
| "learning_rate": 2.119871550156071e-06, | |
| "loss": 4.1835, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 4.000138282775879, | |
| "eval_runtime": 1234.4744, | |
| "eval_samples_per_second": 405.031, | |
| "eval_steps_per_second": 12.657, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 0.9671111111111111, | |
| "grad_norm": 10.133835792541504, | |
| "learning_rate": 1.7456004431369907e-06, | |
| "loss": 4.1556, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 0.9742222222222222, | |
| "grad_norm": 8.190239906311035, | |
| "learning_rate": 1.3717036072249296e-06, | |
| "loss": 4.1504, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 8.191669464111328, | |
| "learning_rate": 9.974325002058493e-07, | |
| "loss": 4.1432, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 0.9884444444444445, | |
| "grad_norm": 7.776695728302002, | |
| "learning_rate": 6.235356642937879e-07, | |
| "loss": 4.1321, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 0.9955555555555555, | |
| "grad_norm": 9.083742141723633, | |
| "learning_rate": 2.492645572747075e-07, | |
| "loss": 4.1393, | |
| "step": 140000 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 140625, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 15000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.541019136e+16, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |