Instructions to use NLPForUA/Llama-3.1-8B-Instruct-zno-cot with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use NLPForUA/Llama-3.1-8B-Instruct-zno-cot with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = PeftModel.from_pretrained(base_model, "NLPForUA/Llama-3.1-8B-Instruct-zno-cot")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_metric": 1.3772375583648682, | |
| "best_model_checkpoint": "4bit_repro_03022025/host8_seed_42_full_det_fp16_no_flash_attn_fix_pad_llama-3.1-instruct-l16-cot-wt-4ep-lr3e04-ws20-bs4-ga4-fp16-05022025/checkpoint-109", | |
| "epoch": 3.9655172413793105, | |
| "eval_steps": 500, | |
| "global_step": 432, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009195402298850575, | |
| "grad_norm": 1.2664293050765991, | |
| "learning_rate": 1.4999999999999999e-05, | |
| "loss": 2.501, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.01839080459770115, | |
| "grad_norm": 0.8997858762741089, | |
| "learning_rate": 2.9999999999999997e-05, | |
| "loss": 2.5106, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.027586206896551724, | |
| "grad_norm": 1.056443214416504, | |
| "learning_rate": 4.4999999999999996e-05, | |
| "loss": 2.3968, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0367816091954023, | |
| "grad_norm": 0.7283720374107361, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 2.2734, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.04597701149425287, | |
| "grad_norm": 2.1415610313415527, | |
| "learning_rate": 7.5e-05, | |
| "loss": 2.3937, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.05517241379310345, | |
| "grad_norm": 0.9106594920158386, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 2.2477, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.06436781609195402, | |
| "grad_norm": 0.5318637490272522, | |
| "learning_rate": 0.00010499999999999999, | |
| "loss": 2.315, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0735632183908046, | |
| "grad_norm": 0.5165982246398926, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 2.2767, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.08275862068965517, | |
| "grad_norm": 0.6848683953285217, | |
| "learning_rate": 0.000135, | |
| "loss": 2.1223, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.09195402298850575, | |
| "grad_norm": 0.9631755948066711, | |
| "learning_rate": 0.00015, | |
| "loss": 1.8723, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10114942528735632, | |
| "grad_norm": 0.5025399923324585, | |
| "learning_rate": 0.000165, | |
| "loss": 1.903, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1103448275862069, | |
| "grad_norm": 0.6477987170219421, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 1.8794, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.11954022988505747, | |
| "grad_norm": 0.8744252324104309, | |
| "learning_rate": 0.000195, | |
| "loss": 1.5671, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.12873563218390804, | |
| "grad_norm": 0.5130301713943481, | |
| "learning_rate": 0.00020999999999999998, | |
| "loss": 1.7795, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.13793103448275862, | |
| "grad_norm": 0.40617436170578003, | |
| "learning_rate": 0.000225, | |
| "loss": 1.763, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1471264367816092, | |
| "grad_norm": 0.46318596601486206, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 1.9987, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.15632183908045977, | |
| "grad_norm": 0.5236511826515198, | |
| "learning_rate": 0.00025499999999999996, | |
| "loss": 1.4979, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.16551724137931034, | |
| "grad_norm": 0.5424871444702148, | |
| "learning_rate": 0.00027, | |
| "loss": 1.4402, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.17471264367816092, | |
| "grad_norm": 0.4224435091018677, | |
| "learning_rate": 0.000285, | |
| "loss": 1.4267, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.1839080459770115, | |
| "grad_norm": 0.3918392062187195, | |
| "learning_rate": 0.0003, | |
| "loss": 1.682, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.19310344827586207, | |
| "grad_norm": 0.38505086302757263, | |
| "learning_rate": 0.00029927184466019415, | |
| "loss": 1.8057, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.20229885057471264, | |
| "grad_norm": 0.3489862382411957, | |
| "learning_rate": 0.00029854368932038833, | |
| "loss": 1.7206, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.21149425287356322, | |
| "grad_norm": 0.42483219504356384, | |
| "learning_rate": 0.0002978155339805825, | |
| "loss": 1.5119, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2206896551724138, | |
| "grad_norm": 0.3796898424625397, | |
| "learning_rate": 0.0002970873786407767, | |
| "loss": 1.6089, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.22988505747126436, | |
| "grad_norm": 0.32184913754463196, | |
| "learning_rate": 0.00029635922330097087, | |
| "loss": 1.7786, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.23908045977011494, | |
| "grad_norm": 0.35197311639785767, | |
| "learning_rate": 0.00029563106796116505, | |
| "loss": 1.5639, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.2482758620689655, | |
| "grad_norm": 0.4310867488384247, | |
| "learning_rate": 0.0002949029126213592, | |
| "loss": 1.5192, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.2574712643678161, | |
| "grad_norm": 0.3951260447502136, | |
| "learning_rate": 0.00029417475728155335, | |
| "loss": 1.4559, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.3948059678077698, | |
| "learning_rate": 0.00029344660194174753, | |
| "loss": 1.7552, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.27586206896551724, | |
| "grad_norm": 0.3327697515487671, | |
| "learning_rate": 0.0002927184466019417, | |
| "loss": 1.9249, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2850574712643678, | |
| "grad_norm": 0.3762843906879425, | |
| "learning_rate": 0.0002919902912621359, | |
| "loss": 1.6966, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.2942528735632184, | |
| "grad_norm": 0.3671524226665497, | |
| "learning_rate": 0.00029126213592233006, | |
| "loss": 1.4718, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.30344827586206896, | |
| "grad_norm": 0.5411608815193176, | |
| "learning_rate": 0.00029053398058252424, | |
| "loss": 1.3394, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.31264367816091954, | |
| "grad_norm": 0.4906865656375885, | |
| "learning_rate": 0.0002898058252427184, | |
| "loss": 1.688, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3218390804597701, | |
| "grad_norm": 0.4257373511791229, | |
| "learning_rate": 0.0002890776699029126, | |
| "loss": 1.7126, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.3310344827586207, | |
| "grad_norm": 0.44731780886650085, | |
| "learning_rate": 0.0002883495145631068, | |
| "loss": 1.5332, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.34022988505747126, | |
| "grad_norm": 0.34879976511001587, | |
| "learning_rate": 0.00028762135922330096, | |
| "loss": 1.6599, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.34942528735632183, | |
| "grad_norm": 0.328622043132782, | |
| "learning_rate": 0.00028689320388349513, | |
| "loss": 1.7865, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.3586206896551724, | |
| "grad_norm": 0.4515538811683655, | |
| "learning_rate": 0.0002861650485436893, | |
| "loss": 1.3684, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.367816091954023, | |
| "grad_norm": 0.39751267433166504, | |
| "learning_rate": 0.0002854368932038835, | |
| "loss": 1.7847, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.37701149425287356, | |
| "grad_norm": 0.33651894330978394, | |
| "learning_rate": 0.00028470873786407767, | |
| "loss": 1.602, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.38620689655172413, | |
| "grad_norm": 0.5487242341041565, | |
| "learning_rate": 0.00028398058252427185, | |
| "loss": 1.2588, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.3954022988505747, | |
| "grad_norm": 0.373512327671051, | |
| "learning_rate": 0.00028325242718446603, | |
| "loss": 1.5344, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.4045977011494253, | |
| "grad_norm": 0.4022233486175537, | |
| "learning_rate": 0.00028252427184466015, | |
| "loss": 1.4503, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.41379310344827586, | |
| "grad_norm": 0.4379498064517975, | |
| "learning_rate": 0.00028179611650485433, | |
| "loss": 1.3271, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.42298850574712643, | |
| "grad_norm": 0.3380189538002014, | |
| "learning_rate": 0.0002810679611650485, | |
| "loss": 1.6746, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.432183908045977, | |
| "grad_norm": 0.41873887181282043, | |
| "learning_rate": 0.0002803398058252427, | |
| "loss": 1.4792, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.4413793103448276, | |
| "grad_norm": 0.3990491032600403, | |
| "learning_rate": 0.00027961165048543687, | |
| "loss": 1.4309, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.45057471264367815, | |
| "grad_norm": 0.41645604372024536, | |
| "learning_rate": 0.00027888349514563105, | |
| "loss": 1.4003, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.45977011494252873, | |
| "grad_norm": 0.46974533796310425, | |
| "learning_rate": 0.0002781553398058252, | |
| "loss": 1.2245, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4689655172413793, | |
| "grad_norm": 0.36877307295799255, | |
| "learning_rate": 0.0002774271844660194, | |
| "loss": 1.5979, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.4781609195402299, | |
| "grad_norm": 0.35563924908638, | |
| "learning_rate": 0.0002766990291262136, | |
| "loss": 1.3965, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.48735632183908045, | |
| "grad_norm": 0.44057121872901917, | |
| "learning_rate": 0.00027597087378640776, | |
| "loss": 1.2961, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.496551724137931, | |
| "grad_norm": 0.46805521845817566, | |
| "learning_rate": 0.00027524271844660194, | |
| "loss": 1.2184, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.5057471264367817, | |
| "grad_norm": 0.3901378810405731, | |
| "learning_rate": 0.0002745145631067961, | |
| "loss": 1.6127, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5149425287356322, | |
| "grad_norm": 0.471978098154068, | |
| "learning_rate": 0.0002737864077669903, | |
| "loss": 1.4896, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.5241379310344828, | |
| "grad_norm": 0.4229693114757538, | |
| "learning_rate": 0.0002730582524271845, | |
| "loss": 1.3084, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.4022842049598694, | |
| "learning_rate": 0.00027233009708737865, | |
| "loss": 1.464, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.542528735632184, | |
| "grad_norm": 0.4753398895263672, | |
| "learning_rate": 0.0002716019417475728, | |
| "loss": 1.4227, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.5517241379310345, | |
| "grad_norm": 0.3991449475288391, | |
| "learning_rate": 0.00027087378640776696, | |
| "loss": 1.4973, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5609195402298851, | |
| "grad_norm": 0.5638157725334167, | |
| "learning_rate": 0.00027014563106796114, | |
| "loss": 1.3702, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.5701149425287356, | |
| "grad_norm": 0.3964412808418274, | |
| "learning_rate": 0.0002694174757281553, | |
| "loss": 1.3154, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.5793103448275863, | |
| "grad_norm": 0.4884161055088043, | |
| "learning_rate": 0.0002686893203883495, | |
| "loss": 1.2915, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.5885057471264368, | |
| "grad_norm": 0.5821394920349121, | |
| "learning_rate": 0.00026796116504854367, | |
| "loss": 1.1925, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.5977011494252874, | |
| "grad_norm": 0.8125529289245605, | |
| "learning_rate": 0.00026723300970873785, | |
| "loss": 1.4684, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6068965517241379, | |
| "grad_norm": 0.46448639035224915, | |
| "learning_rate": 0.00026650485436893203, | |
| "loss": 1.0113, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.6160919540229886, | |
| "grad_norm": 0.5045990347862244, | |
| "learning_rate": 0.0002657766990291262, | |
| "loss": 1.424, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.6252873563218391, | |
| "grad_norm": 0.4608731269836426, | |
| "learning_rate": 0.0002650485436893204, | |
| "loss": 1.4203, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.6344827586206897, | |
| "grad_norm": 0.6361207962036133, | |
| "learning_rate": 0.00026432038834951456, | |
| "loss": 1.4455, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.6436781609195402, | |
| "grad_norm": 0.4667050838470459, | |
| "learning_rate": 0.00026359223300970874, | |
| "loss": 1.3269, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6528735632183909, | |
| "grad_norm": 0.5672599673271179, | |
| "learning_rate": 0.0002628640776699029, | |
| "loss": 1.2629, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.6620689655172414, | |
| "grad_norm": 0.5386179089546204, | |
| "learning_rate": 0.00026213592233009705, | |
| "loss": 1.2188, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.671264367816092, | |
| "grad_norm": 0.4968770146369934, | |
| "learning_rate": 0.0002614077669902912, | |
| "loss": 1.3148, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.6804597701149425, | |
| "grad_norm": 0.5847041010856628, | |
| "learning_rate": 0.0002606796116504854, | |
| "loss": 1.457, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6896551724137931, | |
| "grad_norm": 0.5828786492347717, | |
| "learning_rate": 0.0002599514563106796, | |
| "loss": 1.2948, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.6988505747126437, | |
| "grad_norm": 0.7716859579086304, | |
| "learning_rate": 0.00025922330097087376, | |
| "loss": 1.5078, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.7080459770114943, | |
| "grad_norm": 0.6014898419380188, | |
| "learning_rate": 0.00025849514563106794, | |
| "loss": 1.2796, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.7172413793103448, | |
| "grad_norm": 0.5102112293243408, | |
| "learning_rate": 0.0002577669902912621, | |
| "loss": 1.178, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.7264367816091954, | |
| "grad_norm": 0.5293049812316895, | |
| "learning_rate": 0.0002570388349514563, | |
| "loss": 1.6404, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.735632183908046, | |
| "grad_norm": 0.5157208442687988, | |
| "learning_rate": 0.0002563106796116505, | |
| "loss": 1.4367, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7448275862068966, | |
| "grad_norm": 0.4517700672149658, | |
| "learning_rate": 0.0002555825242718446, | |
| "loss": 1.3063, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.7540229885057471, | |
| "grad_norm": 0.42310649156570435, | |
| "learning_rate": 0.0002548543689320388, | |
| "loss": 1.2749, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.7632183908045977, | |
| "grad_norm": 0.5767822861671448, | |
| "learning_rate": 0.00025412621359223296, | |
| "loss": 1.3432, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.7724137931034483, | |
| "grad_norm": 0.49889254570007324, | |
| "learning_rate": 0.00025339805825242714, | |
| "loss": 1.3629, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.7816091954022989, | |
| "grad_norm": 0.5736685395240784, | |
| "learning_rate": 0.0002526699029126213, | |
| "loss": 1.1754, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.7908045977011494, | |
| "grad_norm": 0.4891297221183777, | |
| "learning_rate": 0.0002519417475728155, | |
| "loss": 1.2404, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.45507243275642395, | |
| "learning_rate": 0.00025121359223300967, | |
| "loss": 1.3653, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.8091954022988506, | |
| "grad_norm": 0.43236321210861206, | |
| "learning_rate": 0.00025048543689320385, | |
| "loss": 1.3037, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.8183908045977012, | |
| "grad_norm": 0.5662382245063782, | |
| "learning_rate": 0.00024975728155339803, | |
| "loss": 1.2031, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.8275862068965517, | |
| "grad_norm": 0.5644109845161438, | |
| "learning_rate": 0.0002490291262135922, | |
| "loss": 1.0595, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8367816091954023, | |
| "grad_norm": 0.4636576473712921, | |
| "learning_rate": 0.0002483009708737864, | |
| "loss": 1.3056, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.8459770114942529, | |
| "grad_norm": 0.4858672022819519, | |
| "learning_rate": 0.00024757281553398056, | |
| "loss": 1.1215, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.8551724137931035, | |
| "grad_norm": 0.5379095673561096, | |
| "learning_rate": 0.00024684466019417474, | |
| "loss": 1.0833, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.864367816091954, | |
| "grad_norm": 0.6066029667854309, | |
| "learning_rate": 0.0002461165048543689, | |
| "loss": 1.1953, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.8735632183908046, | |
| "grad_norm": 0.5864242315292358, | |
| "learning_rate": 0.0002453883495145631, | |
| "loss": 1.2868, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8827586206896552, | |
| "grad_norm": 0.47946035861968994, | |
| "learning_rate": 0.0002446601941747572, | |
| "loss": 1.0682, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.8919540229885058, | |
| "grad_norm": 0.5266394019126892, | |
| "learning_rate": 0.00024393203883495143, | |
| "loss": 1.2363, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.9011494252873563, | |
| "grad_norm": 0.49734389781951904, | |
| "learning_rate": 0.0002432038834951456, | |
| "loss": 1.2858, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.9103448275862069, | |
| "grad_norm": 0.5391829013824463, | |
| "learning_rate": 0.0002424757281553398, | |
| "loss": 1.1882, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.9195402298850575, | |
| "grad_norm": 0.4663216769695282, | |
| "learning_rate": 0.00024174757281553394, | |
| "loss": 1.1461, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9287356321839081, | |
| "grad_norm": 0.5432461500167847, | |
| "learning_rate": 0.00024101941747572812, | |
| "loss": 1.134, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.9379310344827586, | |
| "grad_norm": 0.6043102145195007, | |
| "learning_rate": 0.0002402912621359223, | |
| "loss": 1.0859, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.9471264367816092, | |
| "grad_norm": 0.5635194778442383, | |
| "learning_rate": 0.00023956310679611648, | |
| "loss": 1.0828, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.9563218390804598, | |
| "grad_norm": 0.5683427453041077, | |
| "learning_rate": 0.00023883495145631065, | |
| "loss": 1.0066, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.9655172413793104, | |
| "grad_norm": 0.550366997718811, | |
| "learning_rate": 0.00023810679611650483, | |
| "loss": 1.0664, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.9747126436781609, | |
| "grad_norm": 0.5590884685516357, | |
| "learning_rate": 0.000237378640776699, | |
| "loss": 1.3188, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.9839080459770115, | |
| "grad_norm": 0.49222445487976074, | |
| "learning_rate": 0.0002366504854368932, | |
| "loss": 1.1888, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.993103448275862, | |
| "grad_norm": 0.5352950096130371, | |
| "learning_rate": 0.00023592233009708734, | |
| "loss": 0.9848, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.6914329528808594, | |
| "learning_rate": 0.00023519417475728152, | |
| "loss": 1.1814, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.3772375583648682, | |
| "eval_runtime": 70.9021, | |
| "eval_samples_per_second": 4.683, | |
| "eval_steps_per_second": 2.341, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.0091954022988505, | |
| "grad_norm": 0.5395228862762451, | |
| "learning_rate": 0.0002344660194174757, | |
| "loss": 1.0255, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.018390804597701, | |
| "grad_norm": 0.5161539316177368, | |
| "learning_rate": 0.00023373786407766988, | |
| "loss": 1.0072, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.0275862068965518, | |
| "grad_norm": 0.4717608094215393, | |
| "learning_rate": 0.00023300970873786406, | |
| "loss": 0.9007, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.0367816091954023, | |
| "grad_norm": 0.6422189474105835, | |
| "learning_rate": 0.00023228155339805823, | |
| "loss": 1.1458, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.0459770114942528, | |
| "grad_norm": 0.5921640992164612, | |
| "learning_rate": 0.0002315533980582524, | |
| "loss": 1.0321, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.0551724137931036, | |
| "grad_norm": 0.6381415128707886, | |
| "learning_rate": 0.0002308252427184466, | |
| "loss": 1.0119, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.064367816091954, | |
| "grad_norm": 0.49388617277145386, | |
| "learning_rate": 0.00023009708737864074, | |
| "loss": 0.9483, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.0735632183908046, | |
| "grad_norm": 0.669136106967926, | |
| "learning_rate": 0.00022936893203883492, | |
| "loss": 1.0767, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.0827586206896551, | |
| "grad_norm": 0.5716057419776917, | |
| "learning_rate": 0.0002286407766990291, | |
| "loss": 1.0632, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.0919540229885056, | |
| "grad_norm": 0.5134244561195374, | |
| "learning_rate": 0.00022791262135922328, | |
| "loss": 0.9554, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.1011494252873564, | |
| "grad_norm": 0.49190524220466614, | |
| "learning_rate": 0.00022718446601941746, | |
| "loss": 0.9927, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.110344827586207, | |
| "grad_norm": 0.7558183073997498, | |
| "learning_rate": 0.00022645631067961164, | |
| "loss": 1.0067, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.1195402298850574, | |
| "grad_norm": 0.5597643852233887, | |
| "learning_rate": 0.00022572815533980582, | |
| "loss": 0.8869, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.1287356321839082, | |
| "grad_norm": 0.6764683127403259, | |
| "learning_rate": 0.000225, | |
| "loss": 0.9806, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.1379310344827587, | |
| "grad_norm": 0.5788061618804932, | |
| "learning_rate": 0.00022427184466019415, | |
| "loss": 1.0121, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.1471264367816092, | |
| "grad_norm": 0.5578724145889282, | |
| "learning_rate": 0.00022354368932038832, | |
| "loss": 0.91, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.1563218390804597, | |
| "grad_norm": 0.5371822714805603, | |
| "learning_rate": 0.0002228155339805825, | |
| "loss": 0.8422, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.1655172413793102, | |
| "grad_norm": 0.7994176149368286, | |
| "learning_rate": 0.00022208737864077668, | |
| "loss": 0.9105, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.174712643678161, | |
| "grad_norm": 0.5430970788002014, | |
| "learning_rate": 0.00022135922330097086, | |
| "loss": 0.9843, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.1839080459770115, | |
| "grad_norm": 0.5710289478302002, | |
| "learning_rate": 0.00022063106796116504, | |
| "loss": 0.9191, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.193103448275862, | |
| "grad_norm": 0.6985263228416443, | |
| "learning_rate": 0.00021990291262135922, | |
| "loss": 0.892, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.2022988505747128, | |
| "grad_norm": 0.532593309879303, | |
| "learning_rate": 0.00021917475728155337, | |
| "loss": 0.7993, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.2114942528735633, | |
| "grad_norm": 0.5657442808151245, | |
| "learning_rate": 0.00021844660194174755, | |
| "loss": 0.8961, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.2206896551724138, | |
| "grad_norm": 0.5356594920158386, | |
| "learning_rate": 0.00021771844660194173, | |
| "loss": 0.867, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.2298850574712643, | |
| "grad_norm": 0.9694854617118835, | |
| "learning_rate": 0.0002169902912621359, | |
| "loss": 0.8823, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.2390804597701148, | |
| "grad_norm": 0.5810908675193787, | |
| "learning_rate": 0.00021626213592233008, | |
| "loss": 0.9571, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.2482758620689656, | |
| "grad_norm": 0.5119120478630066, | |
| "learning_rate": 0.00021553398058252426, | |
| "loss": 1.0567, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.257471264367816, | |
| "grad_norm": 0.5349586009979248, | |
| "learning_rate": 0.00021480582524271844, | |
| "loss": 0.8422, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.2666666666666666, | |
| "grad_norm": 0.5498553514480591, | |
| "learning_rate": 0.00021407766990291262, | |
| "loss": 0.9509, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.2758620689655173, | |
| "grad_norm": 0.6952114701271057, | |
| "learning_rate": 0.00021334951456310677, | |
| "loss": 1.062, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.2850574712643679, | |
| "grad_norm": 0.5768996477127075, | |
| "learning_rate": 0.00021262135922330095, | |
| "loss": 0.9139, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.2942528735632184, | |
| "grad_norm": 0.559478759765625, | |
| "learning_rate": 0.00021189320388349513, | |
| "loss": 1.0465, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.303448275862069, | |
| "grad_norm": 0.5405702590942383, | |
| "learning_rate": 0.0002111650485436893, | |
| "loss": 0.7433, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.3126436781609194, | |
| "grad_norm": 0.5924260020256042, | |
| "learning_rate": 0.00021043689320388349, | |
| "loss": 0.6828, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.3218390804597702, | |
| "grad_norm": 0.7219521403312683, | |
| "learning_rate": 0.00020970873786407766, | |
| "loss": 0.8972, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.3310344827586207, | |
| "grad_norm": 0.8541707396507263, | |
| "learning_rate": 0.00020898058252427184, | |
| "loss": 0.7595, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.3402298850574712, | |
| "grad_norm": 0.5988287925720215, | |
| "learning_rate": 0.00020825242718446602, | |
| "loss": 0.7803, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.349425287356322, | |
| "grad_norm": 0.6142683625221252, | |
| "learning_rate": 0.00020752427184466017, | |
| "loss": 0.8507, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.3586206896551725, | |
| "grad_norm": 0.5336320400238037, | |
| "learning_rate": 0.00020679611650485435, | |
| "loss": 0.7742, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.367816091954023, | |
| "grad_norm": 0.5284804701805115, | |
| "learning_rate": 0.00020606796116504853, | |
| "loss": 0.6759, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.3770114942528735, | |
| "grad_norm": 0.4955348074436188, | |
| "learning_rate": 0.0002053398058252427, | |
| "loss": 0.7054, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.386206896551724, | |
| "grad_norm": 0.5454412698745728, | |
| "learning_rate": 0.0002046116504854369, | |
| "loss": 0.8826, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.3954022988505748, | |
| "grad_norm": 0.7631385326385498, | |
| "learning_rate": 0.00020388349514563107, | |
| "loss": 0.8582, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.4045977011494253, | |
| "grad_norm": 0.5374491810798645, | |
| "learning_rate": 0.00020315533980582524, | |
| "loss": 0.9728, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.4137931034482758, | |
| "grad_norm": 0.5633158087730408, | |
| "learning_rate": 0.00020242718446601942, | |
| "loss": 0.8275, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.4229885057471265, | |
| "grad_norm": 0.572356641292572, | |
| "learning_rate": 0.00020169902912621357, | |
| "loss": 0.6565, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.432183908045977, | |
| "grad_norm": 0.6513031125068665, | |
| "learning_rate": 0.00020097087378640775, | |
| "loss": 0.9112, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.4413793103448276, | |
| "grad_norm": 0.48528391122817993, | |
| "learning_rate": 0.00020024271844660193, | |
| "loss": 0.7623, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.450574712643678, | |
| "grad_norm": 0.47339922189712524, | |
| "learning_rate": 0.0001995145631067961, | |
| "loss": 0.621, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.4597701149425286, | |
| "grad_norm": 0.5684221386909485, | |
| "learning_rate": 0.0001987864077669903, | |
| "loss": 0.7902, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.4689655172413794, | |
| "grad_norm": 0.87876296043396, | |
| "learning_rate": 0.00019805825242718447, | |
| "loss": 0.9629, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.4781609195402299, | |
| "grad_norm": 0.5925374031066895, | |
| "learning_rate": 0.00019733009708737865, | |
| "loss": 0.9815, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.4873563218390804, | |
| "grad_norm": 0.48992210626602173, | |
| "learning_rate": 0.0001966019417475728, | |
| "loss": 0.6662, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.4965517241379311, | |
| "grad_norm": 0.9017893075942993, | |
| "learning_rate": 0.00019587378640776698, | |
| "loss": 0.7229, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.5057471264367817, | |
| "grad_norm": 0.6186094880104065, | |
| "learning_rate": 0.00019514563106796116, | |
| "loss": 0.6594, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.5149425287356322, | |
| "grad_norm": 0.5197699069976807, | |
| "learning_rate": 0.00019441747572815533, | |
| "loss": 0.8404, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.524137931034483, | |
| "grad_norm": 0.4972164034843445, | |
| "learning_rate": 0.0001936893203883495, | |
| "loss": 0.8106, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 0.5052672028541565, | |
| "learning_rate": 0.0001929611650485437, | |
| "loss": 0.7682, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.542528735632184, | |
| "grad_norm": 1.1433080434799194, | |
| "learning_rate": 0.00019223300970873787, | |
| "loss": 0.8409, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.5517241379310345, | |
| "grad_norm": 0.730868399143219, | |
| "learning_rate": 0.00019150485436893205, | |
| "loss": 0.7541, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.560919540229885, | |
| "grad_norm": 0.7178101539611816, | |
| "learning_rate": 0.0001907766990291262, | |
| "loss": 0.6899, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.5701149425287357, | |
| "grad_norm": 0.5060885548591614, | |
| "learning_rate": 0.00019004854368932038, | |
| "loss": 0.97, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.5793103448275863, | |
| "grad_norm": 0.7058172821998596, | |
| "learning_rate": 0.00018932038834951456, | |
| "loss": 0.8883, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.5885057471264368, | |
| "grad_norm": 0.6002851128578186, | |
| "learning_rate": 0.00018859223300970874, | |
| "loss": 0.8808, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.5977011494252875, | |
| "grad_norm": 0.5641213059425354, | |
| "learning_rate": 0.00018786407766990291, | |
| "loss": 0.6536, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.6068965517241378, | |
| "grad_norm": 0.621860146522522, | |
| "learning_rate": 0.00018713592233009707, | |
| "loss": 0.784, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.6160919540229886, | |
| "grad_norm": 0.6033377051353455, | |
| "learning_rate": 0.00018640776699029122, | |
| "loss": 0.8755, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.625287356321839, | |
| "grad_norm": 0.7301514148712158, | |
| "learning_rate": 0.0001856796116504854, | |
| "loss": 0.5401, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.6344827586206896, | |
| "grad_norm": 0.6001805067062378, | |
| "learning_rate": 0.00018495145631067957, | |
| "loss": 0.5086, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.6436781609195403, | |
| "grad_norm": 0.5408189296722412, | |
| "learning_rate": 0.00018422330097087375, | |
| "loss": 0.8864, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.6528735632183909, | |
| "grad_norm": 0.548526406288147, | |
| "learning_rate": 0.00018349514563106793, | |
| "loss": 0.7745, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.6620689655172414, | |
| "grad_norm": 0.5930566787719727, | |
| "learning_rate": 0.0001827669902912621, | |
| "loss": 0.8096, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.6712643678160921, | |
| "grad_norm": 0.5918225049972534, | |
| "learning_rate": 0.0001820388349514563, | |
| "loss": 0.8386, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.6804597701149424, | |
| "grad_norm": 0.5837062001228333, | |
| "learning_rate": 0.00018131067961165047, | |
| "loss": 0.6873, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.6896551724137931, | |
| "grad_norm": 0.5399461984634399, | |
| "learning_rate": 0.00018058252427184462, | |
| "loss": 0.8788, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.6988505747126437, | |
| "grad_norm": 0.48315730690956116, | |
| "learning_rate": 0.0001798543689320388, | |
| "loss": 0.7834, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.7080459770114942, | |
| "grad_norm": 0.5982978343963623, | |
| "learning_rate": 0.00017912621359223298, | |
| "loss": 0.6913, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.717241379310345, | |
| "grad_norm": 0.9199710488319397, | |
| "learning_rate": 0.00017839805825242716, | |
| "loss": 0.6592, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.7264367816091954, | |
| "grad_norm": 0.8085829615592957, | |
| "learning_rate": 0.00017766990291262133, | |
| "loss": 0.7439, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.735632183908046, | |
| "grad_norm": 0.6275731325149536, | |
| "learning_rate": 0.0001769417475728155, | |
| "loss": 0.7331, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.7448275862068967, | |
| "grad_norm": 0.5255990624427795, | |
| "learning_rate": 0.0001762135922330097, | |
| "loss": 0.7255, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.754022988505747, | |
| "grad_norm": 0.5680968165397644, | |
| "learning_rate": 0.00017548543689320387, | |
| "loss": 0.7446, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.7632183908045977, | |
| "grad_norm": 0.5626929402351379, | |
| "learning_rate": 0.00017475728155339802, | |
| "loss": 0.7948, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.7724137931034483, | |
| "grad_norm": 0.4922776222229004, | |
| "learning_rate": 0.0001740291262135922, | |
| "loss": 0.6568, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.7816091954022988, | |
| "grad_norm": 0.5294581651687622, | |
| "learning_rate": 0.00017330097087378638, | |
| "loss": 0.5458, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.7908045977011495, | |
| "grad_norm": 0.5421092510223389, | |
| "learning_rate": 0.00017257281553398056, | |
| "loss": 0.6159, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.7755627036094666, | |
| "learning_rate": 0.00017184466019417474, | |
| "loss": 0.7165, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.8091954022988506, | |
| "grad_norm": 0.9723690748214722, | |
| "learning_rate": 0.00017111650485436891, | |
| "loss": 0.7743, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.8183908045977013, | |
| "grad_norm": 0.6832698583602905, | |
| "learning_rate": 0.0001703883495145631, | |
| "loss": 0.7539, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.8275862068965516, | |
| "grad_norm": 0.592229962348938, | |
| "learning_rate": 0.00016966019417475724, | |
| "loss": 0.6838, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.8367816091954023, | |
| "grad_norm": 0.5979653596878052, | |
| "learning_rate": 0.00016893203883495142, | |
| "loss": 0.5518, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.8459770114942529, | |
| "grad_norm": 0.5223293304443359, | |
| "learning_rate": 0.0001682038834951456, | |
| "loss": 0.5327, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.8551724137931034, | |
| "grad_norm": 0.5753535032272339, | |
| "learning_rate": 0.00016747572815533978, | |
| "loss": 0.5783, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.8643678160919541, | |
| "grad_norm": 0.4552556276321411, | |
| "learning_rate": 0.00016674757281553396, | |
| "loss": 0.726, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.8735632183908046, | |
| "grad_norm": 0.46313363313674927, | |
| "learning_rate": 0.00016601941747572814, | |
| "loss": 0.7472, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.8827586206896552, | |
| "grad_norm": 0.7298106551170349, | |
| "learning_rate": 0.00016529126213592232, | |
| "loss": 0.6869, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.891954022988506, | |
| "grad_norm": 0.45026591420173645, | |
| "learning_rate": 0.0001645631067961165, | |
| "loss": 0.6929, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.9011494252873562, | |
| "grad_norm": 0.5525459051132202, | |
| "learning_rate": 0.00016383495145631065, | |
| "loss": 0.6441, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.910344827586207, | |
| "grad_norm": 0.5456161499023438, | |
| "learning_rate": 0.00016310679611650483, | |
| "loss": 0.8553, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.9195402298850575, | |
| "grad_norm": 0.5439329147338867, | |
| "learning_rate": 0.000162378640776699, | |
| "loss": 0.7111, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.928735632183908, | |
| "grad_norm": 0.49237221479415894, | |
| "learning_rate": 0.00016165048543689318, | |
| "loss": 0.6243, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.9379310344827587, | |
| "grad_norm": 0.493568480014801, | |
| "learning_rate": 0.00016092233009708736, | |
| "loss": 0.5351, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.9471264367816092, | |
| "grad_norm": 0.6934286952018738, | |
| "learning_rate": 0.00016019417475728154, | |
| "loss": 0.7098, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.9563218390804598, | |
| "grad_norm": 0.47321686148643494, | |
| "learning_rate": 0.00015946601941747572, | |
| "loss": 0.5246, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.9655172413793105, | |
| "grad_norm": 0.4709272086620331, | |
| "learning_rate": 0.0001587378640776699, | |
| "loss": 0.5498, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.9747126436781608, | |
| "grad_norm": 0.5461050271987915, | |
| "learning_rate": 0.00015800970873786405, | |
| "loss": 0.9668, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.9839080459770115, | |
| "grad_norm": 0.5588494539260864, | |
| "learning_rate": 0.00015728155339805823, | |
| "loss": 0.9515, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.993103448275862, | |
| "grad_norm": 0.6440616846084595, | |
| "learning_rate": 0.0001565533980582524, | |
| "loss": 0.7871, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.6609704494476318, | |
| "learning_rate": 0.00015582524271844658, | |
| "loss": 0.9322, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.3917137384414673, | |
| "eval_runtime": 70.9533, | |
| "eval_samples_per_second": 4.679, | |
| "eval_steps_per_second": 2.34, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.0091954022988507, | |
| "grad_norm": 0.6179903745651245, | |
| "learning_rate": 0.00015509708737864076, | |
| "loss": 0.4927, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.018390804597701, | |
| "grad_norm": 0.48804861307144165, | |
| "learning_rate": 0.00015436893203883494, | |
| "loss": 0.6208, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.027586206896552, | |
| "grad_norm": 0.5461186766624451, | |
| "learning_rate": 0.00015364077669902912, | |
| "loss": 0.5774, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.036781609195402, | |
| "grad_norm": 0.4856977164745331, | |
| "learning_rate": 0.0001529126213592233, | |
| "loss": 0.6426, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.045977011494253, | |
| "grad_norm": 0.3683694005012512, | |
| "learning_rate": 0.00015218446601941745, | |
| "loss": 0.4752, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.0551724137931036, | |
| "grad_norm": 0.5229505300521851, | |
| "learning_rate": 0.00015145631067961163, | |
| "loss": 0.5244, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.064367816091954, | |
| "grad_norm": 0.4558103084564209, | |
| "learning_rate": 0.0001507281553398058, | |
| "loss": 0.5453, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.0735632183908046, | |
| "grad_norm": 0.49105018377304077, | |
| "learning_rate": 0.00015, | |
| "loss": 0.4759, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.0827586206896553, | |
| "grad_norm": 0.6662527322769165, | |
| "learning_rate": 0.00014927184466019417, | |
| "loss": 0.8146, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.0919540229885056, | |
| "grad_norm": 0.6585009694099426, | |
| "learning_rate": 0.00014854368932038834, | |
| "loss": 0.4853, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.1011494252873564, | |
| "grad_norm": 0.46196475625038147, | |
| "learning_rate": 0.00014781553398058252, | |
| "loss": 0.5042, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.110344827586207, | |
| "grad_norm": 0.4975808262825012, | |
| "learning_rate": 0.00014708737864077667, | |
| "loss": 0.5507, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.1195402298850574, | |
| "grad_norm": 0.4535251259803772, | |
| "learning_rate": 0.00014635922330097085, | |
| "loss": 0.5138, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.128735632183908, | |
| "grad_norm": 0.5737034678459167, | |
| "learning_rate": 0.00014563106796116503, | |
| "loss": 0.6338, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.1379310344827585, | |
| "grad_norm": 0.4570606052875519, | |
| "learning_rate": 0.0001449029126213592, | |
| "loss": 0.4852, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.147126436781609, | |
| "grad_norm": 0.571922242641449, | |
| "learning_rate": 0.0001441747572815534, | |
| "loss": 0.6671, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.15632183908046, | |
| "grad_norm": 0.5813591480255127, | |
| "learning_rate": 0.00014344660194174757, | |
| "loss": 0.6764, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.1655172413793102, | |
| "grad_norm": 0.6348188519477844, | |
| "learning_rate": 0.00014271844660194175, | |
| "loss": 0.5268, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.174712643678161, | |
| "grad_norm": 0.5119661688804626, | |
| "learning_rate": 0.00014199029126213592, | |
| "loss": 0.6359, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.1839080459770113, | |
| "grad_norm": 0.478588342666626, | |
| "learning_rate": 0.00014126213592233008, | |
| "loss": 0.6315, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.193103448275862, | |
| "grad_norm": 0.4163134694099426, | |
| "learning_rate": 0.00014053398058252425, | |
| "loss": 0.5183, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.2022988505747128, | |
| "grad_norm": 0.4251859188079834, | |
| "learning_rate": 0.00013980582524271843, | |
| "loss": 0.4611, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.211494252873563, | |
| "grad_norm": 0.6032213568687439, | |
| "learning_rate": 0.0001390776699029126, | |
| "loss": 0.8345, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.220689655172414, | |
| "grad_norm": 0.48562517762184143, | |
| "learning_rate": 0.0001383495145631068, | |
| "loss": 0.4118, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.2298850574712645, | |
| "grad_norm": 0.45934462547302246, | |
| "learning_rate": 0.00013762135922330097, | |
| "loss": 0.4099, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.239080459770115, | |
| "grad_norm": 0.5195468068122864, | |
| "learning_rate": 0.00013689320388349515, | |
| "loss": 0.6766, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.2482758620689656, | |
| "grad_norm": 0.5700232982635498, | |
| "learning_rate": 0.00013616504854368933, | |
| "loss": 0.8323, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.2574712643678163, | |
| "grad_norm": 0.4179138243198395, | |
| "learning_rate": 0.00013543689320388348, | |
| "loss": 0.3745, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 0.5863101482391357, | |
| "learning_rate": 0.00013470873786407766, | |
| "loss": 0.7907, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.2758620689655173, | |
| "grad_norm": 0.3810423016548157, | |
| "learning_rate": 0.00013398058252427184, | |
| "loss": 0.3766, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.2850574712643676, | |
| "grad_norm": 0.6277771592140198, | |
| "learning_rate": 0.00013325242718446601, | |
| "loss": 0.6325, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.2942528735632184, | |
| "grad_norm": 0.5368175506591797, | |
| "learning_rate": 0.0001325242718446602, | |
| "loss": 0.8255, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.303448275862069, | |
| "grad_norm": 0.5138571262359619, | |
| "learning_rate": 0.00013179611650485437, | |
| "loss": 0.5017, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.3126436781609194, | |
| "grad_norm": 0.42882323265075684, | |
| "learning_rate": 0.00013106796116504852, | |
| "loss": 0.4303, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.32183908045977, | |
| "grad_norm": 0.4786723256111145, | |
| "learning_rate": 0.0001303398058252427, | |
| "loss": 0.5765, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.3310344827586205, | |
| "grad_norm": 0.39283815026283264, | |
| "learning_rate": 0.00012961165048543688, | |
| "loss": 0.4495, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.340229885057471, | |
| "grad_norm": 0.4181981086730957, | |
| "learning_rate": 0.00012888349514563106, | |
| "loss": 0.4583, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.349425287356322, | |
| "grad_norm": 0.5347463488578796, | |
| "learning_rate": 0.00012815533980582524, | |
| "loss": 0.6027, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.3586206896551722, | |
| "grad_norm": 0.578888475894928, | |
| "learning_rate": 0.0001274271844660194, | |
| "loss": 0.7933, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.367816091954023, | |
| "grad_norm": 0.5870189666748047, | |
| "learning_rate": 0.00012669902912621357, | |
| "loss": 0.4751, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.3770114942528737, | |
| "grad_norm": 0.43344399333000183, | |
| "learning_rate": 0.00012597087378640775, | |
| "loss": 0.6022, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.386206896551724, | |
| "grad_norm": 0.5693091750144958, | |
| "learning_rate": 0.00012524271844660192, | |
| "loss": 0.6392, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.3954022988505748, | |
| "grad_norm": 0.5089174509048462, | |
| "learning_rate": 0.0001245145631067961, | |
| "loss": 0.6056, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.4045977011494255, | |
| "grad_norm": 0.35980507731437683, | |
| "learning_rate": 0.00012378640776699028, | |
| "loss": 0.4163, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.413793103448276, | |
| "grad_norm": 0.47071072459220886, | |
| "learning_rate": 0.00012305825242718446, | |
| "loss": 0.3934, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.4229885057471265, | |
| "grad_norm": 0.44740450382232666, | |
| "learning_rate": 0.0001223300970873786, | |
| "loss": 0.3688, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.432183908045977, | |
| "grad_norm": 0.3945635259151459, | |
| "learning_rate": 0.0001216019417475728, | |
| "loss": 0.44, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.4413793103448276, | |
| "grad_norm": 0.5359259843826294, | |
| "learning_rate": 0.00012087378640776697, | |
| "loss": 0.7224, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.4505747126436783, | |
| "grad_norm": 0.47032058238983154, | |
| "learning_rate": 0.00012014563106796115, | |
| "loss": 0.5569, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.4597701149425286, | |
| "grad_norm": 0.37549111247062683, | |
| "learning_rate": 0.00011941747572815533, | |
| "loss": 0.431, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.4689655172413794, | |
| "grad_norm": 0.39742720127105713, | |
| "learning_rate": 0.0001186893203883495, | |
| "loss": 0.4328, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.4781609195402297, | |
| "grad_norm": 0.3550657629966736, | |
| "learning_rate": 0.00011796116504854367, | |
| "loss": 0.377, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.4873563218390804, | |
| "grad_norm": 0.4095318615436554, | |
| "learning_rate": 0.00011723300970873785, | |
| "loss": 0.4171, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.496551724137931, | |
| "grad_norm": 0.512859582901001, | |
| "learning_rate": 0.00011650485436893203, | |
| "loss": 0.8625, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.5057471264367814, | |
| "grad_norm": 0.3573138117790222, | |
| "learning_rate": 0.0001157766990291262, | |
| "loss": 0.2954, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.514942528735632, | |
| "grad_norm": 0.38249099254608154, | |
| "learning_rate": 0.00011504854368932037, | |
| "loss": 0.5005, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.524137931034483, | |
| "grad_norm": 0.47415104508399963, | |
| "learning_rate": 0.00011432038834951455, | |
| "loss": 0.538, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 0.2491881549358368, | |
| "learning_rate": 0.00011359223300970873, | |
| "loss": 0.2644, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.542528735632184, | |
| "grad_norm": 0.44782719016075134, | |
| "learning_rate": 0.00011286407766990291, | |
| "loss": 0.5777, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.5517241379310347, | |
| "grad_norm": 0.416042685508728, | |
| "learning_rate": 0.00011213592233009707, | |
| "loss": 0.5316, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.560919540229885, | |
| "grad_norm": 0.3350113034248352, | |
| "learning_rate": 0.00011140776699029125, | |
| "loss": 0.3597, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.5701149425287357, | |
| "grad_norm": 0.2985689640045166, | |
| "learning_rate": 0.00011067961165048543, | |
| "loss": 0.3409, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.5793103448275865, | |
| "grad_norm": 0.3801935613155365, | |
| "learning_rate": 0.00010995145631067961, | |
| "loss": 0.4334, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.5885057471264368, | |
| "grad_norm": 0.42416247725486755, | |
| "learning_rate": 0.00010922330097087377, | |
| "loss": 0.4395, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.5977011494252875, | |
| "grad_norm": 0.5186505913734436, | |
| "learning_rate": 0.00010849514563106795, | |
| "loss": 0.6245, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.606896551724138, | |
| "grad_norm": 0.3158859312534332, | |
| "learning_rate": 0.00010776699029126213, | |
| "loss": 0.3056, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.6160919540229886, | |
| "grad_norm": 0.761823832988739, | |
| "learning_rate": 0.00010703883495145631, | |
| "loss": 0.4312, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.625287356321839, | |
| "grad_norm": 0.38367024064064026, | |
| "learning_rate": 0.00010631067961165047, | |
| "loss": 0.4427, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.6344827586206896, | |
| "grad_norm": 0.389691561460495, | |
| "learning_rate": 0.00010558252427184465, | |
| "loss": 0.3895, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.6436781609195403, | |
| "grad_norm": 0.29964563250541687, | |
| "learning_rate": 0.00010485436893203883, | |
| "loss": 0.2478, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.6528735632183906, | |
| "grad_norm": 0.45362621545791626, | |
| "learning_rate": 0.00010412621359223301, | |
| "loss": 0.5568, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.6620689655172414, | |
| "grad_norm": 0.509632408618927, | |
| "learning_rate": 0.00010339805825242718, | |
| "loss": 0.5516, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.671264367816092, | |
| "grad_norm": 0.4448017477989197, | |
| "learning_rate": 0.00010266990291262135, | |
| "loss": 0.6105, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.6804597701149424, | |
| "grad_norm": 0.3983929455280304, | |
| "learning_rate": 0.00010194174757281553, | |
| "loss": 0.4041, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.689655172413793, | |
| "grad_norm": 0.43729811906814575, | |
| "learning_rate": 0.00010121359223300971, | |
| "loss": 0.6012, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.698850574712644, | |
| "grad_norm": 0.38785648345947266, | |
| "learning_rate": 0.00010048543689320388, | |
| "loss": 0.4812, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.708045977011494, | |
| "grad_norm": 0.6477203369140625, | |
| "learning_rate": 9.975728155339806e-05, | |
| "loss": 0.8292, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.717241379310345, | |
| "grad_norm": 0.5400519967079163, | |
| "learning_rate": 9.902912621359223e-05, | |
| "loss": 0.6755, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.7264367816091957, | |
| "grad_norm": 0.39632365107536316, | |
| "learning_rate": 9.83009708737864e-05, | |
| "loss": 0.4604, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.735632183908046, | |
| "grad_norm": 0.4676194190979004, | |
| "learning_rate": 9.757281553398058e-05, | |
| "loss": 0.5986, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.7448275862068967, | |
| "grad_norm": 0.34826210141181946, | |
| "learning_rate": 9.684466019417476e-05, | |
| "loss": 0.4151, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.754022988505747, | |
| "grad_norm": 0.4393360912799835, | |
| "learning_rate": 9.611650485436893e-05, | |
| "loss": 0.4995, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.7632183908045977, | |
| "grad_norm": 0.39754021167755127, | |
| "learning_rate": 9.53883495145631e-05, | |
| "loss": 0.4783, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.772413793103448, | |
| "grad_norm": 0.5463854074478149, | |
| "learning_rate": 9.466019417475728e-05, | |
| "loss": 0.5441, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.781609195402299, | |
| "grad_norm": 0.3881894648075104, | |
| "learning_rate": 9.393203883495146e-05, | |
| "loss": 0.3949, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.7908045977011495, | |
| "grad_norm": 0.5275911688804626, | |
| "learning_rate": 9.320388349514561e-05, | |
| "loss": 0.6102, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.3656928241252899, | |
| "learning_rate": 9.247572815533979e-05, | |
| "loss": 0.4045, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.8091954022988506, | |
| "grad_norm": 0.35572561621665955, | |
| "learning_rate": 9.174757281553397e-05, | |
| "loss": 0.3061, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.8183908045977013, | |
| "grad_norm": 0.569115400314331, | |
| "learning_rate": 9.101941747572814e-05, | |
| "loss": 0.8109, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.8275862068965516, | |
| "grad_norm": 0.29033228754997253, | |
| "learning_rate": 9.029126213592231e-05, | |
| "loss": 0.317, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.8367816091954023, | |
| "grad_norm": 0.3231950104236603, | |
| "learning_rate": 8.956310679611649e-05, | |
| "loss": 0.3689, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.845977011494253, | |
| "grad_norm": 0.374177485704422, | |
| "learning_rate": 8.883495145631067e-05, | |
| "loss": 0.3802, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.8551724137931034, | |
| "grad_norm": 0.25250348448753357, | |
| "learning_rate": 8.810679611650485e-05, | |
| "loss": 0.236, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.864367816091954, | |
| "grad_norm": 0.5510450005531311, | |
| "learning_rate": 8.737864077669901e-05, | |
| "loss": 0.6565, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.873563218390805, | |
| "grad_norm": 0.32372424006462097, | |
| "learning_rate": 8.665048543689319e-05, | |
| "loss": 0.4243, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.882758620689655, | |
| "grad_norm": 0.34822607040405273, | |
| "learning_rate": 8.592233009708737e-05, | |
| "loss": 0.3775, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.891954022988506, | |
| "grad_norm": 0.535407304763794, | |
| "learning_rate": 8.519417475728155e-05, | |
| "loss": 0.7348, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.901149425287356, | |
| "grad_norm": 0.46572351455688477, | |
| "learning_rate": 8.446601941747571e-05, | |
| "loss": 0.5116, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.910344827586207, | |
| "grad_norm": 0.6060393452644348, | |
| "learning_rate": 8.373786407766989e-05, | |
| "loss": 0.8337, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.9195402298850572, | |
| "grad_norm": 0.3930990397930145, | |
| "learning_rate": 8.300970873786407e-05, | |
| "loss": 0.4314, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.928735632183908, | |
| "grad_norm": 0.3868286907672882, | |
| "learning_rate": 8.228155339805825e-05, | |
| "loss": 0.3842, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.9379310344827587, | |
| "grad_norm": 0.5302227139472961, | |
| "learning_rate": 8.155339805825241e-05, | |
| "loss": 0.7226, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.947126436781609, | |
| "grad_norm": 0.46904534101486206, | |
| "learning_rate": 8.082524271844659e-05, | |
| "loss": 0.5142, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.9563218390804598, | |
| "grad_norm": 0.4153412878513336, | |
| "learning_rate": 8.009708737864077e-05, | |
| "loss": 0.4995, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.9655172413793105, | |
| "grad_norm": 0.3775100111961365, | |
| "learning_rate": 7.936893203883495e-05, | |
| "loss": 0.4082, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.974712643678161, | |
| "grad_norm": 0.49698036909103394, | |
| "learning_rate": 7.864077669902911e-05, | |
| "loss": 0.58, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.9839080459770115, | |
| "grad_norm": 0.6160023808479309, | |
| "learning_rate": 7.791262135922329e-05, | |
| "loss": 0.6326, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.9931034482758623, | |
| "grad_norm": 0.5694648623466492, | |
| "learning_rate": 7.718446601941747e-05, | |
| "loss": 0.5942, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.6814332604408264, | |
| "learning_rate": 7.645631067961165e-05, | |
| "loss": 0.7482, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.406995415687561, | |
| "eval_runtime": 70.8982, | |
| "eval_samples_per_second": 4.683, | |
| "eval_steps_per_second": 2.341, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.0091954022988507, | |
| "grad_norm": 0.3148242235183716, | |
| "learning_rate": 7.572815533980581e-05, | |
| "loss": 0.2836, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.018390804597701, | |
| "grad_norm": 0.2789768576622009, | |
| "learning_rate": 7.5e-05, | |
| "loss": 0.2641, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.027586206896552, | |
| "grad_norm": 0.3457227051258087, | |
| "learning_rate": 7.427184466019417e-05, | |
| "loss": 0.3137, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.036781609195402, | |
| "grad_norm": 0.3316015601158142, | |
| "learning_rate": 7.354368932038834e-05, | |
| "loss": 0.3261, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.045977011494253, | |
| "grad_norm": 0.44476330280303955, | |
| "learning_rate": 7.281553398058252e-05, | |
| "loss": 0.446, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.0551724137931036, | |
| "grad_norm": 0.4332521855831146, | |
| "learning_rate": 7.20873786407767e-05, | |
| "loss": 0.4406, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.064367816091954, | |
| "grad_norm": 0.3380160331726074, | |
| "learning_rate": 7.135922330097087e-05, | |
| "loss": 0.2874, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.0735632183908046, | |
| "grad_norm": 0.341746985912323, | |
| "learning_rate": 7.063106796116504e-05, | |
| "loss": 0.306, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.0827586206896553, | |
| "grad_norm": 0.6307492256164551, | |
| "learning_rate": 6.990291262135922e-05, | |
| "loss": 0.682, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.0919540229885056, | |
| "grad_norm": 0.5883364677429199, | |
| "learning_rate": 6.91747572815534e-05, | |
| "loss": 0.5423, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.1011494252873564, | |
| "grad_norm": 0.36848077178001404, | |
| "learning_rate": 6.844660194174757e-05, | |
| "loss": 0.3521, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.110344827586207, | |
| "grad_norm": 0.38027411699295044, | |
| "learning_rate": 6.771844660194174e-05, | |
| "loss": 0.3215, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.1195402298850574, | |
| "grad_norm": 0.45381873846054077, | |
| "learning_rate": 6.699029126213592e-05, | |
| "loss": 0.5169, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.128735632183908, | |
| "grad_norm": 0.574475884437561, | |
| "learning_rate": 6.62621359223301e-05, | |
| "loss": 0.5358, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.1379310344827585, | |
| "grad_norm": 0.6136448979377747, | |
| "learning_rate": 6.553398058252426e-05, | |
| "loss": 0.7514, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.147126436781609, | |
| "grad_norm": 0.32782498002052307, | |
| "learning_rate": 6.480582524271844e-05, | |
| "loss": 0.3133, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.15632183908046, | |
| "grad_norm": 0.4657121002674103, | |
| "learning_rate": 6.407766990291262e-05, | |
| "loss": 0.4632, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.1655172413793102, | |
| "grad_norm": 0.33205467462539673, | |
| "learning_rate": 6.334951456310678e-05, | |
| "loss": 0.298, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.174712643678161, | |
| "grad_norm": 0.37659305334091187, | |
| "learning_rate": 6.262135922330096e-05, | |
| "loss": 0.2884, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.1839080459770113, | |
| "grad_norm": 0.47502419352531433, | |
| "learning_rate": 6.189320388349514e-05, | |
| "loss": 0.5064, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.193103448275862, | |
| "grad_norm": 0.6712875366210938, | |
| "learning_rate": 6.11650485436893e-05, | |
| "loss": 0.6806, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.2022988505747128, | |
| "grad_norm": 0.5198653340339661, | |
| "learning_rate": 6.0436893203883485e-05, | |
| "loss": 0.5421, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.211494252873563, | |
| "grad_norm": 0.43809249997138977, | |
| "learning_rate": 5.9708737864077663e-05, | |
| "loss": 0.4123, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.220689655172414, | |
| "grad_norm": 0.3182849884033203, | |
| "learning_rate": 5.8980582524271835e-05, | |
| "loss": 0.3025, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.2298850574712645, | |
| "grad_norm": 0.32380396127700806, | |
| "learning_rate": 5.8252427184466014e-05, | |
| "loss": 0.3006, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.239080459770115, | |
| "grad_norm": 0.32146674394607544, | |
| "learning_rate": 5.7524271844660186e-05, | |
| "loss": 0.2698, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.2482758620689656, | |
| "grad_norm": 0.3990193009376526, | |
| "learning_rate": 5.6796116504854364e-05, | |
| "loss": 0.379, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.2574712643678163, | |
| "grad_norm": 0.5906070470809937, | |
| "learning_rate": 5.6067961165048536e-05, | |
| "loss": 0.5351, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.2666666666666666, | |
| "grad_norm": 0.4367024004459381, | |
| "learning_rate": 5.5339805825242715e-05, | |
| "loss": 0.4316, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.2758620689655173, | |
| "grad_norm": 0.4074479639530182, | |
| "learning_rate": 5.461165048543689e-05, | |
| "loss": 0.323, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.2850574712643676, | |
| "grad_norm": 0.3967445194721222, | |
| "learning_rate": 5.3883495145631065e-05, | |
| "loss": 0.3267, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.2942528735632184, | |
| "grad_norm": 0.4907247722148895, | |
| "learning_rate": 5.315533980582524e-05, | |
| "loss": 0.4108, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.303448275862069, | |
| "grad_norm": 0.24740606546401978, | |
| "learning_rate": 5.2427184466019416e-05, | |
| "loss": 0.1978, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.3126436781609194, | |
| "grad_norm": 0.6829200387001038, | |
| "learning_rate": 5.169902912621359e-05, | |
| "loss": 0.6487, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.32183908045977, | |
| "grad_norm": 0.5345665216445923, | |
| "learning_rate": 5.0970873786407766e-05, | |
| "loss": 0.4638, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.3310344827586205, | |
| "grad_norm": 0.3852749466896057, | |
| "learning_rate": 5.024271844660194e-05, | |
| "loss": 0.4437, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.340229885057471, | |
| "grad_norm": 0.3901817202568054, | |
| "learning_rate": 4.951456310679612e-05, | |
| "loss": 0.425, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.349425287356322, | |
| "grad_norm": 0.5544285178184509, | |
| "learning_rate": 4.878640776699029e-05, | |
| "loss": 0.4356, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.3586206896551722, | |
| "grad_norm": 0.49693530797958374, | |
| "learning_rate": 4.805825242718447e-05, | |
| "loss": 0.4442, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.367816091954023, | |
| "grad_norm": 0.48270636796951294, | |
| "learning_rate": 4.733009708737864e-05, | |
| "loss": 0.5047, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.3770114942528737, | |
| "grad_norm": 0.5815203189849854, | |
| "learning_rate": 4.6601941747572804e-05, | |
| "loss": 0.7068, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.386206896551724, | |
| "grad_norm": 0.6433053612709045, | |
| "learning_rate": 4.587378640776698e-05, | |
| "loss": 0.5708, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.3954022988505748, | |
| "grad_norm": 0.35983002185821533, | |
| "learning_rate": 4.5145631067961155e-05, | |
| "loss": 0.289, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.4045977011494255, | |
| "grad_norm": 0.4140273630619049, | |
| "learning_rate": 4.4417475728155334e-05, | |
| "loss": 0.4028, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.413793103448276, | |
| "grad_norm": 0.36001721024513245, | |
| "learning_rate": 4.3689320388349505e-05, | |
| "loss": 0.3798, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.4229885057471265, | |
| "grad_norm": 0.3390791416168213, | |
| "learning_rate": 4.2961165048543684e-05, | |
| "loss": 0.271, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.432183908045977, | |
| "grad_norm": 0.44555366039276123, | |
| "learning_rate": 4.2233009708737856e-05, | |
| "loss": 0.3578, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.4413793103448276, | |
| "grad_norm": 0.5408360958099365, | |
| "learning_rate": 4.1504854368932035e-05, | |
| "loss": 0.6705, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.4505747126436783, | |
| "grad_norm": 0.3276488482952118, | |
| "learning_rate": 4.0776699029126206e-05, | |
| "loss": 0.302, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.4597701149425286, | |
| "grad_norm": 0.5470909476280212, | |
| "learning_rate": 4.0048543689320385e-05, | |
| "loss": 0.4292, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.4689655172413794, | |
| "grad_norm": 0.3690268099308014, | |
| "learning_rate": 3.932038834951456e-05, | |
| "loss": 0.3245, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.4781609195402297, | |
| "grad_norm": 0.5388251543045044, | |
| "learning_rate": 3.8592233009708736e-05, | |
| "loss": 0.5772, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.4873563218390804, | |
| "grad_norm": 0.39318031072616577, | |
| "learning_rate": 3.786407766990291e-05, | |
| "loss": 0.3266, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.496551724137931, | |
| "grad_norm": 0.3929058313369751, | |
| "learning_rate": 3.7135922330097086e-05, | |
| "loss": 0.3401, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.5057471264367814, | |
| "grad_norm": 0.4667484760284424, | |
| "learning_rate": 3.640776699029126e-05, | |
| "loss": 0.3983, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.514942528735632, | |
| "grad_norm": 0.4457002282142639, | |
| "learning_rate": 3.5679611650485437e-05, | |
| "loss": 0.3478, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.524137931034483, | |
| "grad_norm": 0.4100305736064911, | |
| "learning_rate": 3.495145631067961e-05, | |
| "loss": 0.3732, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.533333333333333, | |
| "grad_norm": 0.46938103437423706, | |
| "learning_rate": 3.422330097087379e-05, | |
| "loss": 0.4959, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.542528735632184, | |
| "grad_norm": 0.6163147687911987, | |
| "learning_rate": 3.349514563106796e-05, | |
| "loss": 0.6359, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.5517241379310347, | |
| "grad_norm": 0.39543232321739197, | |
| "learning_rate": 3.276699029126213e-05, | |
| "loss": 0.3193, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.560919540229885, | |
| "grad_norm": 0.3851659297943115, | |
| "learning_rate": 3.203883495145631e-05, | |
| "loss": 0.2912, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.5701149425287357, | |
| "grad_norm": 0.31686195731163025, | |
| "learning_rate": 3.131067961165048e-05, | |
| "loss": 0.2545, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.5793103448275865, | |
| "grad_norm": 0.4179750084877014, | |
| "learning_rate": 3.058252427184465e-05, | |
| "loss": 0.3569, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.5885057471264368, | |
| "grad_norm": 0.49302592873573303, | |
| "learning_rate": 2.9854368932038832e-05, | |
| "loss": 0.4231, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.5977011494252875, | |
| "grad_norm": 0.4251144826412201, | |
| "learning_rate": 2.9126213592233007e-05, | |
| "loss": 0.363, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.606896551724138, | |
| "grad_norm": 0.5428581237792969, | |
| "learning_rate": 2.8398058252427182e-05, | |
| "loss": 0.491, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.6160919540229886, | |
| "grad_norm": 0.4546624720096588, | |
| "learning_rate": 2.7669902912621357e-05, | |
| "loss": 0.4286, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.625287356321839, | |
| "grad_norm": 0.6219410300254822, | |
| "learning_rate": 2.6941747572815533e-05, | |
| "loss": 0.5962, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.6344827586206896, | |
| "grad_norm": 0.4943522810935974, | |
| "learning_rate": 2.6213592233009708e-05, | |
| "loss": 0.3746, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.6436781609195403, | |
| "grad_norm": 0.34759849309921265, | |
| "learning_rate": 2.5485436893203883e-05, | |
| "loss": 0.3504, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.6528735632183906, | |
| "grad_norm": 0.45892637968063354, | |
| "learning_rate": 2.475728155339806e-05, | |
| "loss": 0.2571, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.6620689655172414, | |
| "grad_norm": 0.4840629994869232, | |
| "learning_rate": 2.4029126213592234e-05, | |
| "loss": 0.4038, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.671264367816092, | |
| "grad_norm": 0.5587469339370728, | |
| "learning_rate": 2.3300970873786402e-05, | |
| "loss": 0.6057, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.6804597701149424, | |
| "grad_norm": 0.30649712681770325, | |
| "learning_rate": 2.2572815533980577e-05, | |
| "loss": 0.2603, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.689655172413793, | |
| "grad_norm": 0.41518911719322205, | |
| "learning_rate": 2.1844660194174753e-05, | |
| "loss": 0.3146, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.698850574712644, | |
| "grad_norm": 0.596571147441864, | |
| "learning_rate": 2.1116504854368928e-05, | |
| "loss": 0.6675, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.708045977011494, | |
| "grad_norm": 0.4106796085834503, | |
| "learning_rate": 2.0388349514563103e-05, | |
| "loss": 0.4136, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.717241379310345, | |
| "grad_norm": 0.5261825323104858, | |
| "learning_rate": 1.966019417475728e-05, | |
| "loss": 0.6104, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.7264367816091957, | |
| "grad_norm": 0.5117531418800354, | |
| "learning_rate": 1.8932038834951454e-05, | |
| "loss": 0.4088, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.735632183908046, | |
| "grad_norm": 0.46011051535606384, | |
| "learning_rate": 1.820388349514563e-05, | |
| "loss": 0.3203, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.7448275862068967, | |
| "grad_norm": 0.5213779211044312, | |
| "learning_rate": 1.7475728155339804e-05, | |
| "loss": 0.4547, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.754022988505747, | |
| "grad_norm": 0.6189706921577454, | |
| "learning_rate": 1.674757281553398e-05, | |
| "loss": 0.4992, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.7632183908045977, | |
| "grad_norm": 0.38668373227119446, | |
| "learning_rate": 1.6019417475728155e-05, | |
| "loss": 0.3237, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.772413793103448, | |
| "grad_norm": 0.6517217755317688, | |
| "learning_rate": 1.5291262135922327e-05, | |
| "loss": 0.6421, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.781609195402299, | |
| "grad_norm": 0.4396495223045349, | |
| "learning_rate": 1.4563106796116503e-05, | |
| "loss": 0.4529, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.7908045977011495, | |
| "grad_norm": 0.5464988350868225, | |
| "learning_rate": 1.3834951456310679e-05, | |
| "loss": 0.5085, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.49847450852394104, | |
| "learning_rate": 1.3106796116504854e-05, | |
| "loss": 0.4448, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.8091954022988506, | |
| "grad_norm": 0.26215147972106934, | |
| "learning_rate": 1.237864077669903e-05, | |
| "loss": 0.207, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.8183908045977013, | |
| "grad_norm": 0.5166394710540771, | |
| "learning_rate": 1.1650485436893201e-05, | |
| "loss": 0.4386, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.8275862068965516, | |
| "grad_norm": 0.3470999002456665, | |
| "learning_rate": 1.0922330097087376e-05, | |
| "loss": 0.3337, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.8367816091954023, | |
| "grad_norm": 0.41472527384757996, | |
| "learning_rate": 1.0194174757281552e-05, | |
| "loss": 0.3873, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.845977011494253, | |
| "grad_norm": 0.3731400668621063, | |
| "learning_rate": 9.466019417475727e-06, | |
| "loss": 0.3003, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.8551724137931034, | |
| "grad_norm": 0.4794781506061554, | |
| "learning_rate": 8.737864077669902e-06, | |
| "loss": 0.4373, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.864367816091954, | |
| "grad_norm": 0.4784807562828064, | |
| "learning_rate": 8.009708737864077e-06, | |
| "loss": 0.3727, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.873563218390805, | |
| "grad_norm": 0.4147201478481293, | |
| "learning_rate": 7.281553398058252e-06, | |
| "loss": 0.416, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.882758620689655, | |
| "grad_norm": 0.33760857582092285, | |
| "learning_rate": 6.553398058252427e-06, | |
| "loss": 0.2351, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.891954022988506, | |
| "grad_norm": 0.38601207733154297, | |
| "learning_rate": 5.8252427184466006e-06, | |
| "loss": 0.328, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.901149425287356, | |
| "grad_norm": 0.5886791944503784, | |
| "learning_rate": 5.097087378640776e-06, | |
| "loss": 0.523, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.910344827586207, | |
| "grad_norm": 0.497087687253952, | |
| "learning_rate": 4.368932038834951e-06, | |
| "loss": 0.4786, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.9195402298850572, | |
| "grad_norm": 0.29946476221084595, | |
| "learning_rate": 3.640776699029126e-06, | |
| "loss": 0.208, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.928735632183908, | |
| "grad_norm": 0.4071413576602936, | |
| "learning_rate": 2.9126213592233003e-06, | |
| "loss": 0.3595, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.9379310344827587, | |
| "grad_norm": 0.4986608624458313, | |
| "learning_rate": 2.1844660194174755e-06, | |
| "loss": 0.3931, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.947126436781609, | |
| "grad_norm": 0.32442793250083923, | |
| "learning_rate": 1.4563106796116501e-06, | |
| "loss": 0.4026, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.9563218390804598, | |
| "grad_norm": 0.6014803051948547, | |
| "learning_rate": 7.281553398058251e-07, | |
| "loss": 0.6049, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.9655172413793105, | |
| "grad_norm": 0.6689134836196899, | |
| "learning_rate": 0.0, | |
| "loss": 0.6334, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.9655172413793105, | |
| "eval_loss": 1.4621280431747437, | |
| "eval_runtime": 70.3815, | |
| "eval_samples_per_second": 4.717, | |
| "eval_steps_per_second": 2.359, | |
| "step": 432 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 432, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.5357902367421235e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |