{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 19.86993980407715, "learning_rate": 2.0000000000000002e-07, "loss": 0.7037215232849121, "step": 1 }, { "epoch": 0.0004, "grad_norm": 17.658987045288086, "learning_rate": 4.0000000000000003e-07, "loss": 0.4115454852581024, "step": 2 }, { "epoch": 0.0006, "grad_norm": 27.911306381225586, "learning_rate": 6.000000000000001e-07, "loss": 0.7317515015602112, "step": 3 }, { "epoch": 0.0008, "grad_norm": 23.786823272705078, "learning_rate": 8.000000000000001e-07, "loss": 0.4670589864253998, "step": 4 }, { "epoch": 0.001, "grad_norm": 736.2578125, "learning_rate": 1.0000000000000002e-06, "loss": 8.38817024230957, "step": 5 }, { "epoch": 0.0012, "grad_norm": 483.0341491699219, "learning_rate": 1.2000000000000002e-06, "loss": 7.923478603363037, "step": 6 }, { "epoch": 0.0014, "grad_norm": 26.421083450317383, "learning_rate": 1.4000000000000001e-06, "loss": 0.5545251369476318, "step": 7 }, { "epoch": 0.0016, "grad_norm": 25.409162521362305, "learning_rate": 1.6000000000000001e-06, "loss": 0.5838867425918579, "step": 8 }, { "epoch": 0.0018, "grad_norm": 23.164382934570312, "learning_rate": 1.8000000000000001e-06, "loss": 0.9057097434997559, "step": 9 }, { "epoch": 0.002, "grad_norm": 18.42571258544922, "learning_rate": 2.0000000000000003e-06, "loss": 0.3040672242641449, "step": 10 }, { "epoch": 0.0022, "grad_norm": 15.404990196228027, "learning_rate": 2.2e-06, "loss": 0.8651700615882874, "step": 11 }, { "epoch": 0.0024, "grad_norm": 11.985212326049805, "learning_rate": 2.4000000000000003e-06, "loss": 0.7416182160377502, "step": 12 }, { "epoch": 0.0026, "grad_norm": 29.11711311340332, "learning_rate": 2.6e-06, "loss": 1.2297133207321167, "step": 13 }, { "epoch": 0.0028, "grad_norm": 27.741609573364258, "learning_rate": 2.8000000000000003e-06, "loss": 1.1059151887893677, "step": 14 }, { "epoch": 0.003, "grad_norm": 26.51416015625, "learning_rate": 3e-06, "loss": 0.6906358599662781, "step": 15 }, { "epoch": 0.0032, "grad_norm": 29.349390029907227, "learning_rate": 3.2000000000000003e-06, "loss": 0.6707777380943298, "step": 16 }, { "epoch": 0.0034, "grad_norm": 262.01763916015625, "learning_rate": 3.4000000000000005e-06, "loss": 5.405397891998291, "step": 17 }, { "epoch": 0.0036, "grad_norm": 203.5963134765625, "learning_rate": 3.6000000000000003e-06, "loss": 5.416719913482666, "step": 18 }, { "epoch": 0.0038, "grad_norm": 17.128952026367188, "learning_rate": 3.8000000000000005e-06, "loss": 0.6377151608467102, "step": 19 }, { "epoch": 0.004, "grad_norm": 18.25572967529297, "learning_rate": 4.000000000000001e-06, "loss": 0.6638563871383667, "step": 20 }, { "epoch": 0.0042, "grad_norm": 224.96484375, "learning_rate": 4.2000000000000004e-06, "loss": 2.4988303184509277, "step": 21 }, { "epoch": 0.0044, "grad_norm": 186.37709045410156, "learning_rate": 4.4e-06, "loss": 2.37096905708313, "step": 22 }, { "epoch": 0.0046, "grad_norm": 301.34625244140625, "learning_rate": 4.600000000000001e-06, "loss": 6.065738201141357, "step": 23 }, { "epoch": 0.0048, "grad_norm": 124.02801513671875, "learning_rate": 4.800000000000001e-06, "loss": 4.959842681884766, "step": 24 }, { "epoch": 0.005, "grad_norm": 191.34832763671875, "learning_rate": 5e-06, "loss": 3.2318477630615234, "step": 25 }, { "epoch": 0.0052, "grad_norm": 102.69784545898438, "learning_rate": 5.2e-06, "loss": 2.0269229412078857, "step": 26 }, { "epoch": 0.0054, "grad_norm": 185.41197204589844, "learning_rate": 5.400000000000001e-06, "loss": 3.1023576259613037, "step": 27 }, { "epoch": 0.0056, "grad_norm": 115.0453872680664, "learning_rate": 5.600000000000001e-06, "loss": 2.033405065536499, "step": 28 }, { "epoch": 0.0058, "grad_norm": 19.503202438354492, "learning_rate": 5.8e-06, "loss": 0.6852450966835022, "step": 29 }, { "epoch": 0.006, "grad_norm": 16.313444137573242, "learning_rate": 6e-06, "loss": 0.6160387992858887, "step": 30 }, { "epoch": 0.0062, "grad_norm": 34.96455764770508, "learning_rate": 6.200000000000001e-06, "loss": 0.8548898696899414, "step": 31 }, { "epoch": 0.0064, "grad_norm": 28.085153579711914, "learning_rate": 6.4000000000000006e-06, "loss": 0.43956005573272705, "step": 32 }, { "epoch": 0.0066, "grad_norm": 15.484055519104004, "learning_rate": 6.600000000000001e-06, "loss": 0.6819882392883301, "step": 33 }, { "epoch": 0.0068, "grad_norm": 11.606240272521973, "learning_rate": 6.800000000000001e-06, "loss": 0.2640717327594757, "step": 34 }, { "epoch": 0.007, "grad_norm": 16.289966583251953, "learning_rate": 7e-06, "loss": 0.6590662598609924, "step": 35 }, { "epoch": 0.0072, "grad_norm": 16.290184020996094, "learning_rate": 7.2000000000000005e-06, "loss": 0.5614698529243469, "step": 36 }, { "epoch": 0.0074, "grad_norm": 9.540451049804688, "learning_rate": 7.4e-06, "loss": 3.6535511016845703, "step": 37 }, { "epoch": 0.0076, "grad_norm": 8.212828636169434, "learning_rate": 7.600000000000001e-06, "loss": 3.6082541942596436, "step": 38 }, { "epoch": 0.0078, "grad_norm": 110.34765625, "learning_rate": 7.800000000000002e-06, "loss": 2.08210825920105, "step": 39 }, { "epoch": 0.008, "grad_norm": 54.909912109375, "learning_rate": 8.000000000000001e-06, "loss": 1.8885221481323242, "step": 40 }, { "epoch": 0.0082, "grad_norm": 14.795517921447754, "learning_rate": 8.2e-06, "loss": 0.8238609433174133, "step": 41 }, { "epoch": 0.0084, "grad_norm": 11.740867614746094, "learning_rate": 8.400000000000001e-06, "loss": 0.2301148623228073, "step": 42 }, { "epoch": 0.0086, "grad_norm": 12.639492988586426, "learning_rate": 8.6e-06, "loss": 0.42430806159973145, "step": 43 }, { "epoch": 0.0088, "grad_norm": 14.877533912658691, "learning_rate": 8.8e-06, "loss": 0.5195145606994629, "step": 44 }, { "epoch": 0.009, "grad_norm": 16.365663528442383, "learning_rate": 9e-06, "loss": 0.7476734519004822, "step": 45 }, { "epoch": 0.0092, "grad_norm": 15.314481735229492, "learning_rate": 9.200000000000002e-06, "loss": 0.6565811634063721, "step": 46 }, { "epoch": 0.0094, "grad_norm": 15.30256175994873, "learning_rate": 9.4e-06, "loss": 0.7285122871398926, "step": 47 }, { "epoch": 0.0096, "grad_norm": 15.99083137512207, "learning_rate": 9.600000000000001e-06, "loss": 0.7699318528175354, "step": 48 }, { "epoch": 0.0098, "grad_norm": 15.664243698120117, "learning_rate": 9.800000000000001e-06, "loss": 0.9520533084869385, "step": 49 }, { "epoch": 0.01, "grad_norm": 12.90129566192627, "learning_rate": 1e-05, "loss": 0.8631060719490051, "step": 50 }, { "epoch": 0.0102, "grad_norm": 10.566743850708008, "learning_rate": 1.02e-05, "loss": 0.4236431121826172, "step": 51 }, { "epoch": 0.0104, "grad_norm": 9.604334831237793, "learning_rate": 1.04e-05, "loss": 0.36946138739585876, "step": 52 }, { "epoch": 0.0106, "grad_norm": 43.98063659667969, "learning_rate": 1.0600000000000002e-05, "loss": 1.8339109420776367, "step": 53 }, { "epoch": 0.0108, "grad_norm": 23.898250579833984, "learning_rate": 1.0800000000000002e-05, "loss": 1.3176811933517456, "step": 54 }, { "epoch": 0.011, "grad_norm": 13.772536277770996, "learning_rate": 1.1000000000000001e-05, "loss": 0.5442234873771667, "step": 55 }, { "epoch": 0.0112, "grad_norm": 6.4163737297058105, "learning_rate": 1.1200000000000001e-05, "loss": 0.14489132165908813, "step": 56 }, { "epoch": 0.0114, "grad_norm": 19.58946990966797, "learning_rate": 1.14e-05, "loss": 1.6732454299926758, "step": 57 }, { "epoch": 0.0116, "grad_norm": 29.295209884643555, "learning_rate": 1.16e-05, "loss": 1.700900673866272, "step": 58 }, { "epoch": 0.0118, "grad_norm": 10.337576866149902, "learning_rate": 1.18e-05, "loss": 0.6807141304016113, "step": 59 }, { "epoch": 0.012, "grad_norm": 9.932164192199707, "learning_rate": 1.2e-05, "loss": 0.2126649022102356, "step": 60 }, { "epoch": 0.0122, "grad_norm": 14.373818397521973, "learning_rate": 1.22e-05, "loss": 0.5442662835121155, "step": 61 }, { "epoch": 0.0124, "grad_norm": 7.414956092834473, "learning_rate": 1.2400000000000002e-05, "loss": 0.20551882684230804, "step": 62 }, { "epoch": 0.0126, "grad_norm": 22.96013641357422, "learning_rate": 1.2600000000000001e-05, "loss": 0.9355632662773132, "step": 63 }, { "epoch": 0.0128, "grad_norm": 30.31879234313965, "learning_rate": 1.2800000000000001e-05, "loss": 2.546799898147583, "step": 64 }, { "epoch": 0.013, "grad_norm": 18.958202362060547, "learning_rate": 1.3000000000000001e-05, "loss": 1.2627061605453491, "step": 65 }, { "epoch": 0.0132, "grad_norm": 19.17605209350586, "learning_rate": 1.3200000000000002e-05, "loss": 1.7123245000839233, "step": 66 }, { "epoch": 0.0134, "grad_norm": 17.46062469482422, "learning_rate": 1.3400000000000002e-05, "loss": 0.7560626864433289, "step": 67 }, { "epoch": 0.0136, "grad_norm": 17.04747200012207, "learning_rate": 1.3600000000000002e-05, "loss": 0.7125154137611389, "step": 68 }, { "epoch": 0.0138, "grad_norm": 14.661514282226562, "learning_rate": 1.38e-05, "loss": 0.5681678652763367, "step": 69 }, { "epoch": 0.014, "grad_norm": 11.718018531799316, "learning_rate": 1.4e-05, "loss": 0.39901670813560486, "step": 70 }, { "epoch": 0.0142, "grad_norm": 17.879074096679688, "learning_rate": 1.4200000000000001e-05, "loss": 1.1150238513946533, "step": 71 }, { "epoch": 0.0144, "grad_norm": 16.158737182617188, "learning_rate": 1.4400000000000001e-05, "loss": 1.1416844129562378, "step": 72 }, { "epoch": 0.0146, "grad_norm": 17.92641258239746, "learning_rate": 1.46e-05, "loss": 1.118418574333191, "step": 73 }, { "epoch": 0.0148, "grad_norm": 14.92599105834961, "learning_rate": 1.48e-05, "loss": 1.0633187294006348, "step": 74 }, { "epoch": 0.015, "grad_norm": 12.820063591003418, "learning_rate": 1.5000000000000002e-05, "loss": 0.53122878074646, "step": 75 }, { "epoch": 0.0152, "grad_norm": 14.567853927612305, "learning_rate": 1.5200000000000002e-05, "loss": 0.4186933934688568, "step": 76 }, { "epoch": 0.0154, "grad_norm": 17.807376861572266, "learning_rate": 1.54e-05, "loss": 1.7758623361587524, "step": 77 }, { "epoch": 0.0156, "grad_norm": 16.614009857177734, "learning_rate": 1.5600000000000003e-05, "loss": 1.3614182472229004, "step": 78 }, { "epoch": 0.0158, "grad_norm": 20.342607498168945, "learning_rate": 1.58e-05, "loss": 1.0765376091003418, "step": 79 }, { "epoch": 0.016, "grad_norm": 18.532209396362305, "learning_rate": 1.6000000000000003e-05, "loss": 1.0294160842895508, "step": 80 }, { "epoch": 0.0162, "grad_norm": 14.95114803314209, "learning_rate": 1.62e-05, "loss": 0.9725584387779236, "step": 81 }, { "epoch": 0.0164, "grad_norm": 16.384130477905273, "learning_rate": 1.64e-05, "loss": 1.2944960594177246, "step": 82 }, { "epoch": 0.0166, "grad_norm": 20.59752082824707, "learning_rate": 1.66e-05, "loss": 1.2144062519073486, "step": 83 }, { "epoch": 0.0168, "grad_norm": 14.727459907531738, "learning_rate": 1.6800000000000002e-05, "loss": 1.4357770681381226, "step": 84 }, { "epoch": 0.017, "grad_norm": 13.784932136535645, "learning_rate": 1.7e-05, "loss": 0.8966814875602722, "step": 85 }, { "epoch": 0.0172, "grad_norm": 8.46639347076416, "learning_rate": 1.72e-05, "loss": 0.7025353312492371, "step": 86 }, { "epoch": 0.0174, "grad_norm": 17.623991012573242, "learning_rate": 1.7400000000000003e-05, "loss": 1.518816351890564, "step": 87 }, { "epoch": 0.0176, "grad_norm": 15.044212341308594, "learning_rate": 1.76e-05, "loss": 1.7205009460449219, "step": 88 }, { "epoch": 0.0178, "grad_norm": 15.654994010925293, "learning_rate": 1.7800000000000002e-05, "loss": 0.5644822120666504, "step": 89 }, { "epoch": 0.018, "grad_norm": 23.13344955444336, "learning_rate": 1.8e-05, "loss": 1.4888534545898438, "step": 90 }, { "epoch": 0.0182, "grad_norm": 15.345812797546387, "learning_rate": 1.8200000000000002e-05, "loss": 2.276603937149048, "step": 91 }, { "epoch": 0.0184, "grad_norm": 12.848119735717773, "learning_rate": 1.8400000000000003e-05, "loss": 2.165219306945801, "step": 92 }, { "epoch": 0.0186, "grad_norm": 12.793383598327637, "learning_rate": 1.86e-05, "loss": 0.40768125653266907, "step": 93 }, { "epoch": 0.0188, "grad_norm": 13.451870918273926, "learning_rate": 1.88e-05, "loss": 0.33898743987083435, "step": 94 }, { "epoch": 0.019, "grad_norm": 8.729146003723145, "learning_rate": 1.9e-05, "loss": 0.3523624837398529, "step": 95 }, { "epoch": 0.0192, "grad_norm": 11.61637020111084, "learning_rate": 1.9200000000000003e-05, "loss": 0.36027684807777405, "step": 96 }, { "epoch": 0.0194, "grad_norm": 11.825678825378418, "learning_rate": 1.94e-05, "loss": 0.4324101507663727, "step": 97 }, { "epoch": 0.0196, "grad_norm": 6.112161159515381, "learning_rate": 1.9600000000000002e-05, "loss": 0.12024092674255371, "step": 98 }, { "epoch": 0.0198, "grad_norm": 8.166031837463379, "learning_rate": 1.98e-05, "loss": 2.617506504058838, "step": 99 }, { "epoch": 0.02, "grad_norm": 6.198336124420166, "learning_rate": 2e-05, "loss": 2.6058919429779053, "step": 100 }, { "epoch": 0.0202, "grad_norm": 23.16619300842285, "learning_rate": 1.999591836734694e-05, "loss": 1.200722098350525, "step": 101 }, { "epoch": 0.0204, "grad_norm": 36.44799041748047, "learning_rate": 1.999183673469388e-05, "loss": 2.272332191467285, "step": 102 }, { "epoch": 0.0206, "grad_norm": 14.700854301452637, "learning_rate": 1.9987755102040818e-05, "loss": 1.4091886281967163, "step": 103 }, { "epoch": 0.0208, "grad_norm": 38.54850769042969, "learning_rate": 1.9983673469387756e-05, "loss": 1.6711570024490356, "step": 104 }, { "epoch": 0.021, "grad_norm": 19.840042114257812, "learning_rate": 1.9979591836734697e-05, "loss": 1.034736156463623, "step": 105 }, { "epoch": 0.0212, "grad_norm": 15.678369522094727, "learning_rate": 1.9975510204081634e-05, "loss": 1.1222563982009888, "step": 106 }, { "epoch": 0.0214, "grad_norm": 16.824556350708008, "learning_rate": 1.9971428571428572e-05, "loss": 1.1417587995529175, "step": 107 }, { "epoch": 0.0216, "grad_norm": 20.7105770111084, "learning_rate": 1.9967346938775513e-05, "loss": 1.1519407033920288, "step": 108 }, { "epoch": 0.0218, "grad_norm": 10.305238723754883, "learning_rate": 1.996326530612245e-05, "loss": 0.4236001968383789, "step": 109 }, { "epoch": 0.022, "grad_norm": 10.24266242980957, "learning_rate": 1.9959183673469388e-05, "loss": 0.31372806429862976, "step": 110 }, { "epoch": 0.0222, "grad_norm": 9.28374195098877, "learning_rate": 1.995510204081633e-05, "loss": 0.48246851563453674, "step": 111 }, { "epoch": 0.0224, "grad_norm": 7.033731460571289, "learning_rate": 1.9951020408163267e-05, "loss": 0.1980251520872116, "step": 112 }, { "epoch": 0.0226, "grad_norm": 16.573246002197266, "learning_rate": 1.9946938775510204e-05, "loss": 2.1796398162841797, "step": 113 }, { "epoch": 0.0228, "grad_norm": 13.532843589782715, "learning_rate": 1.9942857142857142e-05, "loss": 2.140252113342285, "step": 114 }, { "epoch": 0.023, "grad_norm": 11.697552680969238, "learning_rate": 1.9938775510204083e-05, "loss": 0.422529935836792, "step": 115 }, { "epoch": 0.0232, "grad_norm": 6.95910120010376, "learning_rate": 1.993469387755102e-05, "loss": 0.12921731173992157, "step": 116 }, { "epoch": 0.0234, "grad_norm": 9.09284782409668, "learning_rate": 1.993061224489796e-05, "loss": 0.36724767088890076, "step": 117 }, { "epoch": 0.0236, "grad_norm": 10.362861633300781, "learning_rate": 1.99265306122449e-05, "loss": 0.3813866376876831, "step": 118 }, { "epoch": 0.0238, "grad_norm": 13.540061950683594, "learning_rate": 1.9922448979591837e-05, "loss": 1.1938616037368774, "step": 119 }, { "epoch": 0.024, "grad_norm": 20.51814842224121, "learning_rate": 1.9918367346938775e-05, "loss": 0.9086930155754089, "step": 120 }, { "epoch": 0.0242, "grad_norm": 13.426894187927246, "learning_rate": 1.9914285714285716e-05, "loss": 0.5107138752937317, "step": 121 }, { "epoch": 0.0244, "grad_norm": 12.804654121398926, "learning_rate": 1.9910204081632653e-05, "loss": 0.561721682548523, "step": 122 }, { "epoch": 0.0246, "grad_norm": 20.744733810424805, "learning_rate": 1.9906122448979594e-05, "loss": 0.6595413684844971, "step": 123 }, { "epoch": 0.0248, "grad_norm": 14.705329895019531, "learning_rate": 1.9902040816326532e-05, "loss": 0.7842282652854919, "step": 124 }, { "epoch": 0.025, "grad_norm": 12.542791366577148, "learning_rate": 1.9897959183673473e-05, "loss": 0.6271759867668152, "step": 125 }, { "epoch": 0.0252, "grad_norm": 10.929245948791504, "learning_rate": 1.989387755102041e-05, "loss": 0.4905226230621338, "step": 126 }, { "epoch": 0.0254, "grad_norm": 14.044593811035156, "learning_rate": 1.988979591836735e-05, "loss": 0.9779181480407715, "step": 127 }, { "epoch": 0.0256, "grad_norm": 13.47326946258545, "learning_rate": 1.988571428571429e-05, "loss": 0.8366219401359558, "step": 128 }, { "epoch": 0.0258, "grad_norm": 10.467619895935059, "learning_rate": 1.9881632653061227e-05, "loss": 0.4179813861846924, "step": 129 }, { "epoch": 0.026, "grad_norm": 11.507650375366211, "learning_rate": 1.9877551020408165e-05, "loss": 0.37075725197792053, "step": 130 }, { "epoch": 0.0262, "grad_norm": 17.782169342041016, "learning_rate": 1.9873469387755106e-05, "loss": 1.3293887376785278, "step": 131 }, { "epoch": 0.0264, "grad_norm": 12.833648681640625, "learning_rate": 1.9869387755102043e-05, "loss": 0.45509305596351624, "step": 132 }, { "epoch": 0.0266, "grad_norm": 22.548070907592773, "learning_rate": 1.986530612244898e-05, "loss": 0.5744470357894897, "step": 133 }, { "epoch": 0.0268, "grad_norm": 12.30579948425293, "learning_rate": 1.9861224489795922e-05, "loss": 0.20529121160507202, "step": 134 }, { "epoch": 0.027, "grad_norm": 17.774799346923828, "learning_rate": 1.985714285714286e-05, "loss": 1.0186904668807983, "step": 135 }, { "epoch": 0.0272, "grad_norm": 14.874011993408203, "learning_rate": 1.9853061224489797e-05, "loss": 0.779259204864502, "step": 136 }, { "epoch": 0.0274, "grad_norm": 10.167969703674316, "learning_rate": 1.984897959183674e-05, "loss": 0.1621553897857666, "step": 137 }, { "epoch": 0.0276, "grad_norm": 7.144793510437012, "learning_rate": 1.9844897959183676e-05, "loss": 0.09167339652776718, "step": 138 }, { "epoch": 0.0278, "grad_norm": 12.632438659667969, "learning_rate": 1.9840816326530614e-05, "loss": 0.5062233805656433, "step": 139 }, { "epoch": 0.028, "grad_norm": 10.130064964294434, "learning_rate": 1.983673469387755e-05, "loss": 0.16204434633255005, "step": 140 }, { "epoch": 0.0282, "grad_norm": 12.270245552062988, "learning_rate": 1.9832653061224492e-05, "loss": 0.43254661560058594, "step": 141 }, { "epoch": 0.0284, "grad_norm": 7.125492095947266, "learning_rate": 1.982857142857143e-05, "loss": 0.17016957700252533, "step": 142 }, { "epoch": 0.0286, "grad_norm": 50.75775909423828, "learning_rate": 1.9824489795918368e-05, "loss": 1.619408130645752, "step": 143 }, { "epoch": 0.0288, "grad_norm": 58.936527252197266, "learning_rate": 1.982040816326531e-05, "loss": 2.543792963027954, "step": 144 }, { "epoch": 0.029, "grad_norm": 11.407219886779785, "learning_rate": 1.9816326530612246e-05, "loss": 0.23832444846630096, "step": 145 }, { "epoch": 0.0292, "grad_norm": 3.9544060230255127, "learning_rate": 1.9812244897959184e-05, "loss": 0.11590949445962906, "step": 146 }, { "epoch": 0.0294, "grad_norm": 10.821979522705078, "learning_rate": 1.9808163265306125e-05, "loss": 0.5023337006568909, "step": 147 }, { "epoch": 0.0296, "grad_norm": 9.180195808410645, "learning_rate": 1.9804081632653063e-05, "loss": 0.38146814703941345, "step": 148 }, { "epoch": 0.0298, "grad_norm": 16.268226623535156, "learning_rate": 1.98e-05, "loss": 0.5793936848640442, "step": 149 }, { "epoch": 0.03, "grad_norm": 17.789043426513672, "learning_rate": 1.979591836734694e-05, "loss": 0.5812906622886658, "step": 150 }, { "epoch": 0.0302, "grad_norm": 13.205047607421875, "learning_rate": 1.979183673469388e-05, "loss": 0.7076604962348938, "step": 151 }, { "epoch": 0.0304, "grad_norm": 16.953746795654297, "learning_rate": 1.9787755102040816e-05, "loss": 0.7535090446472168, "step": 152 }, { "epoch": 0.0306, "grad_norm": 13.64711856842041, "learning_rate": 1.9783673469387757e-05, "loss": 0.5489466786384583, "step": 153 }, { "epoch": 0.0308, "grad_norm": 12.464900970458984, "learning_rate": 1.9779591836734695e-05, "loss": 0.3147149384021759, "step": 154 }, { "epoch": 0.031, "grad_norm": 15.660754203796387, "learning_rate": 1.9775510204081633e-05, "loss": 0.5840892195701599, "step": 155 }, { "epoch": 0.0312, "grad_norm": 12.983843803405762, "learning_rate": 1.9771428571428574e-05, "loss": 0.16638237237930298, "step": 156 }, { "epoch": 0.0314, "grad_norm": 10.69919490814209, "learning_rate": 1.976734693877551e-05, "loss": 0.4699489176273346, "step": 157 }, { "epoch": 0.0316, "grad_norm": 8.013006210327148, "learning_rate": 1.976326530612245e-05, "loss": 0.140150785446167, "step": 158 }, { "epoch": 0.0318, "grad_norm": 10.338350296020508, "learning_rate": 1.975918367346939e-05, "loss": 0.6970502734184265, "step": 159 }, { "epoch": 0.032, "grad_norm": 18.74736976623535, "learning_rate": 1.9755102040816328e-05, "loss": 0.6801952719688416, "step": 160 }, { "epoch": 0.0322, "grad_norm": 14.109589576721191, "learning_rate": 1.9751020408163265e-05, "loss": 1.7827507257461548, "step": 161 }, { "epoch": 0.0324, "grad_norm": 15.422320365905762, "learning_rate": 1.9746938775510206e-05, "loss": 1.477315902709961, "step": 162 }, { "epoch": 0.0326, "grad_norm": 21.485685348510742, "learning_rate": 1.9742857142857144e-05, "loss": 1.0516995191574097, "step": 163 }, { "epoch": 0.0328, "grad_norm": 18.223512649536133, "learning_rate": 1.973877551020408e-05, "loss": 1.0833985805511475, "step": 164 }, { "epoch": 0.033, "grad_norm": 10.750219345092773, "learning_rate": 1.9734693877551023e-05, "loss": 1.9473018646240234, "step": 165 }, { "epoch": 0.0332, "grad_norm": 10.971612930297852, "learning_rate": 1.973061224489796e-05, "loss": 1.8101927042007446, "step": 166 }, { "epoch": 0.0334, "grad_norm": 10.93656063079834, "learning_rate": 1.9726530612244898e-05, "loss": 2.7729265689849854, "step": 167 }, { "epoch": 0.0336, "grad_norm": 10.621392250061035, "learning_rate": 1.9722448979591836e-05, "loss": 2.4153614044189453, "step": 168 }, { "epoch": 0.0338, "grad_norm": 8.495170593261719, "learning_rate": 1.9718367346938777e-05, "loss": 0.2918655276298523, "step": 169 }, { "epoch": 0.034, "grad_norm": 9.634339332580566, "learning_rate": 1.9714285714285718e-05, "loss": 0.3897497355937958, "step": 170 }, { "epoch": 0.0342, "grad_norm": 7.359801769256592, "learning_rate": 1.9710204081632655e-05, "loss": 0.2934911847114563, "step": 171 }, { "epoch": 0.0344, "grad_norm": 19.19106674194336, "learning_rate": 1.9706122448979593e-05, "loss": 0.25113484263420105, "step": 172 }, { "epoch": 0.0346, "grad_norm": 11.436328887939453, "learning_rate": 1.9702040816326534e-05, "loss": 1.3752632141113281, "step": 173 }, { "epoch": 0.0348, "grad_norm": 11.581891059875488, "learning_rate": 1.969795918367347e-05, "loss": 1.111779808998108, "step": 174 }, { "epoch": 0.035, "grad_norm": 10.153788566589355, "learning_rate": 1.969387755102041e-05, "loss": 0.21362058818340302, "step": 175 }, { "epoch": 0.0352, "grad_norm": 9.029555320739746, "learning_rate": 1.968979591836735e-05, "loss": 0.18420429527759552, "step": 176 }, { "epoch": 0.0354, "grad_norm": 12.224593162536621, "learning_rate": 1.9685714285714288e-05, "loss": 0.8582136034965515, "step": 177 }, { "epoch": 0.0356, "grad_norm": 12.30530834197998, "learning_rate": 1.9681632653061226e-05, "loss": 0.790539562702179, "step": 178 }, { "epoch": 0.0358, "grad_norm": 11.196264266967773, "learning_rate": 1.9677551020408167e-05, "loss": 0.7992145419120789, "step": 179 }, { "epoch": 0.036, "grad_norm": 11.926046371459961, "learning_rate": 1.9673469387755104e-05, "loss": 1.205533742904663, "step": 180 }, { "epoch": 0.0362, "grad_norm": 11.32097053527832, "learning_rate": 1.9669387755102042e-05, "loss": 2.065629243850708, "step": 181 }, { "epoch": 0.0364, "grad_norm": 10.51060962677002, "learning_rate": 1.9665306122448983e-05, "loss": 1.9534316062927246, "step": 182 }, { "epoch": 0.0366, "grad_norm": 12.027915000915527, "learning_rate": 1.966122448979592e-05, "loss": 0.6187699437141418, "step": 183 }, { "epoch": 0.0368, "grad_norm": 10.165942192077637, "learning_rate": 1.9657142857142858e-05, "loss": 0.8004693984985352, "step": 184 }, { "epoch": 0.037, "grad_norm": 13.300896644592285, "learning_rate": 1.96530612244898e-05, "loss": 0.57137131690979, "step": 185 }, { "epoch": 0.0372, "grad_norm": 11.380193710327148, "learning_rate": 1.9648979591836737e-05, "loss": 0.24036674201488495, "step": 186 }, { "epoch": 0.0374, "grad_norm": 15.774130821228027, "learning_rate": 1.9644897959183674e-05, "loss": 0.5146348476409912, "step": 187 }, { "epoch": 0.0376, "grad_norm": 14.454614639282227, "learning_rate": 1.9640816326530616e-05, "loss": 1.4913374185562134, "step": 188 }, { "epoch": 0.0378, "grad_norm": 12.067485809326172, "learning_rate": 1.9636734693877553e-05, "loss": 0.6330704689025879, "step": 189 }, { "epoch": 0.038, "grad_norm": 12.86535358428955, "learning_rate": 1.963265306122449e-05, "loss": 1.4309087991714478, "step": 190 }, { "epoch": 0.0382, "grad_norm": 11.127947807312012, "learning_rate": 1.9628571428571432e-05, "loss": 0.7268363833427429, "step": 191 }, { "epoch": 0.0384, "grad_norm": 10.412270545959473, "learning_rate": 1.962448979591837e-05, "loss": 1.0172964334487915, "step": 192 }, { "epoch": 0.0386, "grad_norm": 8.217020034790039, "learning_rate": 1.9620408163265307e-05, "loss": 0.34921762347221375, "step": 193 }, { "epoch": 0.0388, "grad_norm": 9.945459365844727, "learning_rate": 1.9616326530612245e-05, "loss": 0.3862124979496002, "step": 194 }, { "epoch": 0.039, "grad_norm": 6.348339080810547, "learning_rate": 1.9612244897959186e-05, "loss": 0.26032719016075134, "step": 195 }, { "epoch": 0.0392, "grad_norm": 10.936368942260742, "learning_rate": 1.9608163265306123e-05, "loss": 0.3817369043827057, "step": 196 }, { "epoch": 0.0394, "grad_norm": 17.57430648803711, "learning_rate": 1.960408163265306e-05, "loss": 0.8593737483024597, "step": 197 }, { "epoch": 0.0396, "grad_norm": 12.015942573547363, "learning_rate": 1.9600000000000002e-05, "loss": 0.8226779103279114, "step": 198 }, { "epoch": 0.0398, "grad_norm": 14.96813678741455, "learning_rate": 1.959591836734694e-05, "loss": 2.1698710918426514, "step": 199 }, { "epoch": 0.04, "grad_norm": 13.83712387084961, "learning_rate": 1.9591836734693877e-05, "loss": 2.1152279376983643, "step": 200 }, { "epoch": 0.0402, "grad_norm": 12.426931381225586, "learning_rate": 1.958775510204082e-05, "loss": 0.5947772264480591, "step": 201 }, { "epoch": 0.0404, "grad_norm": 6.404804229736328, "learning_rate": 1.9583673469387756e-05, "loss": 0.09137320518493652, "step": 202 }, { "epoch": 0.0406, "grad_norm": 20.05866050720215, "learning_rate": 1.9579591836734694e-05, "loss": 0.9587116241455078, "step": 203 }, { "epoch": 0.0408, "grad_norm": 13.285184860229492, "learning_rate": 1.9575510204081635e-05, "loss": 0.7525708079338074, "step": 204 }, { "epoch": 0.041, "grad_norm": 18.103675842285156, "learning_rate": 1.9571428571428572e-05, "loss": 0.9542830586433411, "step": 205 }, { "epoch": 0.0412, "grad_norm": 18.912912368774414, "learning_rate": 1.956734693877551e-05, "loss": 1.5049800872802734, "step": 206 }, { "epoch": 0.0414, "grad_norm": 17.95879554748535, "learning_rate": 1.956326530612245e-05, "loss": 0.4391709864139557, "step": 207 }, { "epoch": 0.0416, "grad_norm": 16.980247497558594, "learning_rate": 1.955918367346939e-05, "loss": 0.880516529083252, "step": 208 }, { "epoch": 0.0418, "grad_norm": 25.475059509277344, "learning_rate": 1.9555102040816326e-05, "loss": 1.8526705503463745, "step": 209 }, { "epoch": 0.042, "grad_norm": 27.993799209594727, "learning_rate": 1.9551020408163267e-05, "loss": 2.4861631393432617, "step": 210 }, { "epoch": 0.0422, "grad_norm": 13.218708038330078, "learning_rate": 1.9546938775510205e-05, "loss": 0.7200023531913757, "step": 211 }, { "epoch": 0.0424, "grad_norm": 7.854865074157715, "learning_rate": 1.9542857142857143e-05, "loss": 0.4573878049850464, "step": 212 }, { "epoch": 0.0426, "grad_norm": 16.14672088623047, "learning_rate": 1.9538775510204084e-05, "loss": 0.5513072609901428, "step": 213 }, { "epoch": 0.0428, "grad_norm": 16.056869506835938, "learning_rate": 1.9534693877551025e-05, "loss": 0.41998934745788574, "step": 214 }, { "epoch": 0.043, "grad_norm": 15.660920143127441, "learning_rate": 1.9530612244897962e-05, "loss": 0.292898565530777, "step": 215 }, { "epoch": 0.0432, "grad_norm": 15.301045417785645, "learning_rate": 1.95265306122449e-05, "loss": 0.37798991799354553, "step": 216 }, { "epoch": 0.0434, "grad_norm": 12.420317649841309, "learning_rate": 1.952244897959184e-05, "loss": 1.0150760412216187, "step": 217 }, { "epoch": 0.0436, "grad_norm": 12.52476692199707, "learning_rate": 1.951836734693878e-05, "loss": 1.0119246244430542, "step": 218 }, { "epoch": 0.0438, "grad_norm": 13.150724411010742, "learning_rate": 1.9514285714285716e-05, "loss": 1.8081260919570923, "step": 219 }, { "epoch": 0.044, "grad_norm": 14.73547077178955, "learning_rate": 1.9510204081632654e-05, "loss": 1.8981086015701294, "step": 220 }, { "epoch": 0.0442, "grad_norm": 12.522819519042969, "learning_rate": 1.9506122448979595e-05, "loss": 0.4660833179950714, "step": 221 }, { "epoch": 0.0444, "grad_norm": 8.506572723388672, "learning_rate": 1.9502040816326533e-05, "loss": 0.36523711681365967, "step": 222 }, { "epoch": 0.0446, "grad_norm": 13.08540153503418, "learning_rate": 1.949795918367347e-05, "loss": 0.4142066538333893, "step": 223 }, { "epoch": 0.0448, "grad_norm": 4.7789154052734375, "learning_rate": 1.949387755102041e-05, "loss": 0.043511006981134415, "step": 224 }, { "epoch": 0.045, "grad_norm": 8.255807876586914, "learning_rate": 1.948979591836735e-05, "loss": 0.341126412153244, "step": 225 }, { "epoch": 0.0452, "grad_norm": 8.003602981567383, "learning_rate": 1.9485714285714286e-05, "loss": 0.30454233288764954, "step": 226 }, { "epoch": 0.0454, "grad_norm": 11.212254524230957, "learning_rate": 1.9481632653061227e-05, "loss": 0.4171302318572998, "step": 227 }, { "epoch": 0.0456, "grad_norm": 9.085224151611328, "learning_rate": 1.9477551020408165e-05, "loss": 0.4786769151687622, "step": 228 }, { "epoch": 0.0458, "grad_norm": 10.804896354675293, "learning_rate": 1.9473469387755103e-05, "loss": 1.325444221496582, "step": 229 }, { "epoch": 0.046, "grad_norm": 13.959992408752441, "learning_rate": 1.9469387755102044e-05, "loss": 0.5055341720581055, "step": 230 }, { "epoch": 0.0462, "grad_norm": 10.12248706817627, "learning_rate": 1.946530612244898e-05, "loss": 0.2702232301235199, "step": 231 }, { "epoch": 0.0464, "grad_norm": 6.399989128112793, "learning_rate": 1.946122448979592e-05, "loss": 0.13826696574687958, "step": 232 }, { "epoch": 0.0466, "grad_norm": 10.234905242919922, "learning_rate": 1.945714285714286e-05, "loss": 1.8679510354995728, "step": 233 }, { "epoch": 0.0468, "grad_norm": 10.236428260803223, "learning_rate": 1.9453061224489798e-05, "loss": 1.8090623617172241, "step": 234 }, { "epoch": 0.047, "grad_norm": 8.79781723022461, "learning_rate": 1.9448979591836735e-05, "loss": 1.9289888143539429, "step": 235 }, { "epoch": 0.0472, "grad_norm": 10.299851417541504, "learning_rate": 1.9444897959183676e-05, "loss": 1.7997150421142578, "step": 236 }, { "epoch": 0.0474, "grad_norm": 10.11899185180664, "learning_rate": 1.9440816326530614e-05, "loss": 0.24996638298034668, "step": 237 }, { "epoch": 0.0476, "grad_norm": 6.126742839813232, "learning_rate": 1.943673469387755e-05, "loss": 0.10375798493623734, "step": 238 }, { "epoch": 0.0478, "grad_norm": 12.840261459350586, "learning_rate": 1.9432653061224493e-05, "loss": 0.5607141852378845, "step": 239 }, { "epoch": 0.048, "grad_norm": 13.04357624053955, "learning_rate": 1.942857142857143e-05, "loss": 0.19896751642227173, "step": 240 }, { "epoch": 0.0482, "grad_norm": 12.065544128417969, "learning_rate": 1.9424489795918368e-05, "loss": 0.5035838484764099, "step": 241 }, { "epoch": 0.0484, "grad_norm": 6.4033379554748535, "learning_rate": 1.942040816326531e-05, "loss": 0.08241923898458481, "step": 242 }, { "epoch": 0.0486, "grad_norm": 13.771985054016113, "learning_rate": 1.9416326530612247e-05, "loss": 0.9879512190818787, "step": 243 }, { "epoch": 0.0488, "grad_norm": 9.499734878540039, "learning_rate": 1.9412244897959184e-05, "loss": 0.2659399211406708, "step": 244 }, { "epoch": 0.049, "grad_norm": 9.90380573272705, "learning_rate": 1.9408163265306122e-05, "loss": 0.4616040289402008, "step": 245 }, { "epoch": 0.0492, "grad_norm": 8.802172660827637, "learning_rate": 1.9404081632653063e-05, "loss": 0.2890472114086151, "step": 246 }, { "epoch": 0.0494, "grad_norm": 12.497790336608887, "learning_rate": 1.94e-05, "loss": 1.005435585975647, "step": 247 }, { "epoch": 0.0496, "grad_norm": 13.029925346374512, "learning_rate": 1.9395918367346938e-05, "loss": 0.9115619659423828, "step": 248 }, { "epoch": 0.0498, "grad_norm": 10.337368965148926, "learning_rate": 1.939183673469388e-05, "loss": 0.3756994903087616, "step": 249 }, { "epoch": 0.05, "grad_norm": 36.60479736328125, "learning_rate": 1.9387755102040817e-05, "loss": 0.40144655108451843, "step": 250 }, { "epoch": 0.0502, "grad_norm": 8.426389694213867, "learning_rate": 1.9383673469387755e-05, "loss": 0.38817885518074036, "step": 251 }, { "epoch": 0.0504, "grad_norm": 9.67792797088623, "learning_rate": 1.9379591836734696e-05, "loss": 0.13594162464141846, "step": 252 }, { "epoch": 0.0506, "grad_norm": 6.6829328536987305, "learning_rate": 1.9375510204081633e-05, "loss": 0.06705179065465927, "step": 253 }, { "epoch": 0.0508, "grad_norm": 7.050276279449463, "learning_rate": 1.937142857142857e-05, "loss": 0.06950942426919937, "step": 254 }, { "epoch": 0.051, "grad_norm": 9.589434623718262, "learning_rate": 1.9367346938775512e-05, "loss": 0.3774583637714386, "step": 255 }, { "epoch": 0.0512, "grad_norm": 10.09981918334961, "learning_rate": 1.936326530612245e-05, "loss": 0.3024117946624756, "step": 256 }, { "epoch": 0.0514, "grad_norm": 17.173498153686523, "learning_rate": 1.9359183673469387e-05, "loss": 1.305375337600708, "step": 257 }, { "epoch": 0.0516, "grad_norm": 9.821687698364258, "learning_rate": 1.9355102040816328e-05, "loss": 0.3765145540237427, "step": 258 }, { "epoch": 0.0518, "grad_norm": 9.418091773986816, "learning_rate": 1.935102040816327e-05, "loss": 0.2349083572626114, "step": 259 }, { "epoch": 0.052, "grad_norm": 11.253679275512695, "learning_rate": 1.9346938775510207e-05, "loss": 0.2814233601093292, "step": 260 }, { "epoch": 0.0522, "grad_norm": 17.10232925415039, "learning_rate": 1.9342857142857144e-05, "loss": 0.7681066989898682, "step": 261 }, { "epoch": 0.0524, "grad_norm": 16.147369384765625, "learning_rate": 1.9338775510204086e-05, "loss": 0.9630041718482971, "step": 262 }, { "epoch": 0.0526, "grad_norm": 14.772577285766602, "learning_rate": 1.9334693877551023e-05, "loss": 0.7542390823364258, "step": 263 }, { "epoch": 0.0528, "grad_norm": 8.295753479003906, "learning_rate": 1.933061224489796e-05, "loss": 0.5259800553321838, "step": 264 }, { "epoch": 0.053, "grad_norm": 18.530479431152344, "learning_rate": 1.9326530612244902e-05, "loss": 0.5631456971168518, "step": 265 }, { "epoch": 0.0532, "grad_norm": 18.48293685913086, "learning_rate": 1.932244897959184e-05, "loss": 0.5957385301589966, "step": 266 }, { "epoch": 0.0534, "grad_norm": 10.628904342651367, "learning_rate": 1.9318367346938777e-05, "loss": 0.5219213366508484, "step": 267 }, { "epoch": 0.0536, "grad_norm": 9.826141357421875, "learning_rate": 1.9314285714285718e-05, "loss": 0.16056272387504578, "step": 268 }, { "epoch": 0.0538, "grad_norm": 10.817519187927246, "learning_rate": 1.9310204081632656e-05, "loss": 0.3732954263687134, "step": 269 }, { "epoch": 0.054, "grad_norm": 9.00774097442627, "learning_rate": 1.9306122448979593e-05, "loss": 0.5658559799194336, "step": 270 }, { "epoch": 0.0542, "grad_norm": 16.364824295043945, "learning_rate": 1.930204081632653e-05, "loss": 1.1479178667068481, "step": 271 }, { "epoch": 0.0544, "grad_norm": 15.974719047546387, "learning_rate": 1.9297959183673472e-05, "loss": 1.3299745321273804, "step": 272 }, { "epoch": 0.0546, "grad_norm": 13.28140640258789, "learning_rate": 1.929387755102041e-05, "loss": 0.43030405044555664, "step": 273 }, { "epoch": 0.0548, "grad_norm": 12.720165252685547, "learning_rate": 1.9289795918367347e-05, "loss": 0.46472522616386414, "step": 274 }, { "epoch": 0.055, "grad_norm": 16.799394607543945, "learning_rate": 1.928571428571429e-05, "loss": 0.9537304043769836, "step": 275 }, { "epoch": 0.0552, "grad_norm": 15.223164558410645, "learning_rate": 1.9281632653061226e-05, "loss": 0.7769754528999329, "step": 276 }, { "epoch": 0.0554, "grad_norm": 13.078629493713379, "learning_rate": 1.9277551020408164e-05, "loss": 1.3505254983901978, "step": 277 }, { "epoch": 0.0556, "grad_norm": 16.278898239135742, "learning_rate": 1.9273469387755105e-05, "loss": 1.538793683052063, "step": 278 }, { "epoch": 0.0558, "grad_norm": 17.326444625854492, "learning_rate": 1.9269387755102042e-05, "loss": 1.2157453298568726, "step": 279 }, { "epoch": 0.056, "grad_norm": 13.043142318725586, "learning_rate": 1.926530612244898e-05, "loss": 1.0745325088500977, "step": 280 }, { "epoch": 0.0562, "grad_norm": 13.386103630065918, "learning_rate": 1.926122448979592e-05, "loss": 1.6535282135009766, "step": 281 }, { "epoch": 0.0564, "grad_norm": 14.423442840576172, "learning_rate": 1.925714285714286e-05, "loss": 1.681785225868225, "step": 282 }, { "epoch": 0.0566, "grad_norm": 20.163217544555664, "learning_rate": 1.9253061224489796e-05, "loss": 0.7129221558570862, "step": 283 }, { "epoch": 0.0568, "grad_norm": 18.918493270874023, "learning_rate": 1.9248979591836737e-05, "loss": 0.8880770802497864, "step": 284 }, { "epoch": 0.057, "grad_norm": 13.713382720947266, "learning_rate": 1.9244897959183675e-05, "loss": 0.7181944847106934, "step": 285 }, { "epoch": 0.0572, "grad_norm": 6.4058518409729, "learning_rate": 1.9240816326530613e-05, "loss": 0.08155521005392075, "step": 286 }, { "epoch": 0.0574, "grad_norm": 12.686845779418945, "learning_rate": 1.9236734693877554e-05, "loss": 0.6097404360771179, "step": 287 }, { "epoch": 0.0576, "grad_norm": 5.447309494018555, "learning_rate": 1.923265306122449e-05, "loss": 0.09038770943880081, "step": 288 }, { "epoch": 0.0578, "grad_norm": 17.65273666381836, "learning_rate": 1.922857142857143e-05, "loss": 1.1048628091812134, "step": 289 }, { "epoch": 0.058, "grad_norm": 16.5939998626709, "learning_rate": 1.922448979591837e-05, "loss": 0.8333539962768555, "step": 290 }, { "epoch": 0.0582, "grad_norm": 11.614035606384277, "learning_rate": 1.9220408163265308e-05, "loss": 0.4503566026687622, "step": 291 }, { "epoch": 0.0584, "grad_norm": 7.452908515930176, "learning_rate": 1.9216326530612245e-05, "loss": 0.14592118561267853, "step": 292 }, { "epoch": 0.0586, "grad_norm": 12.480263710021973, "learning_rate": 1.9212244897959186e-05, "loss": 0.598111093044281, "step": 293 }, { "epoch": 0.0588, "grad_norm": 10.65322494506836, "learning_rate": 1.9208163265306124e-05, "loss": 0.4835715591907501, "step": 294 }, { "epoch": 0.059, "grad_norm": 13.473339080810547, "learning_rate": 1.920408163265306e-05, "loss": 1.2906862497329712, "step": 295 }, { "epoch": 0.0592, "grad_norm": 11.87508487701416, "learning_rate": 1.9200000000000003e-05, "loss": 1.0924484729766846, "step": 296 }, { "epoch": 0.0594, "grad_norm": 11.235217094421387, "learning_rate": 1.919591836734694e-05, "loss": 0.70457524061203, "step": 297 }, { "epoch": 0.0596, "grad_norm": 9.966221809387207, "learning_rate": 1.9191836734693878e-05, "loss": 0.7067977786064148, "step": 298 }, { "epoch": 0.0598, "grad_norm": 24.72576904296875, "learning_rate": 1.9187755102040815e-05, "loss": 1.2337020635604858, "step": 299 }, { "epoch": 0.06, "grad_norm": 32.2328987121582, "learning_rate": 1.9183673469387756e-05, "loss": 2.0896799564361572, "step": 300 }, { "epoch": 0.0602, "grad_norm": 11.01812744140625, "learning_rate": 1.9179591836734694e-05, "loss": 0.8730123043060303, "step": 301 }, { "epoch": 0.0604, "grad_norm": 9.47146987915039, "learning_rate": 1.9175510204081632e-05, "loss": 0.849687397480011, "step": 302 }, { "epoch": 0.0606, "grad_norm": 13.432365417480469, "learning_rate": 1.9171428571428573e-05, "loss": 1.5861668586730957, "step": 303 }, { "epoch": 0.0608, "grad_norm": 13.760697364807129, "learning_rate": 1.916734693877551e-05, "loss": 2.072270631790161, "step": 304 }, { "epoch": 0.061, "grad_norm": 15.910550117492676, "learning_rate": 1.916326530612245e-05, "loss": 0.7796235680580139, "step": 305 }, { "epoch": 0.0612, "grad_norm": 17.948802947998047, "learning_rate": 1.915918367346939e-05, "loss": 1.462945580482483, "step": 306 }, { "epoch": 0.0614, "grad_norm": 20.616914749145508, "learning_rate": 1.915510204081633e-05, "loss": 1.0277634859085083, "step": 307 }, { "epoch": 0.0616, "grad_norm": 19.797767639160156, "learning_rate": 1.9151020408163268e-05, "loss": 1.071796178817749, "step": 308 }, { "epoch": 0.0618, "grad_norm": 9.767040252685547, "learning_rate": 1.9146938775510205e-05, "loss": 0.3060181736946106, "step": 309 }, { "epoch": 0.062, "grad_norm": 8.75549602508545, "learning_rate": 1.9142857142857146e-05, "loss": 0.3348238170146942, "step": 310 }, { "epoch": 0.0622, "grad_norm": 10.77705192565918, "learning_rate": 1.9138775510204084e-05, "loss": 0.69964998960495, "step": 311 }, { "epoch": 0.0624, "grad_norm": 10.151534080505371, "learning_rate": 1.913469387755102e-05, "loss": 1.0875051021575928, "step": 312 }, { "epoch": 0.0626, "grad_norm": 9.344226837158203, "learning_rate": 1.9130612244897963e-05, "loss": 0.2267366647720337, "step": 313 }, { "epoch": 0.0628, "grad_norm": 7.4025397300720215, "learning_rate": 1.91265306122449e-05, "loss": 0.10137917846441269, "step": 314 }, { "epoch": 0.063, "grad_norm": 10.967966079711914, "learning_rate": 1.9122448979591838e-05, "loss": 0.4692521393299103, "step": 315 }, { "epoch": 0.0632, "grad_norm": 11.424884796142578, "learning_rate": 1.911836734693878e-05, "loss": 0.244221031665802, "step": 316 }, { "epoch": 0.0634, "grad_norm": 13.277408599853516, "learning_rate": 1.9114285714285717e-05, "loss": 0.6109925508499146, "step": 317 }, { "epoch": 0.0636, "grad_norm": 7.46019983291626, "learning_rate": 1.9110204081632654e-05, "loss": 0.13937990367412567, "step": 318 }, { "epoch": 0.0638, "grad_norm": 19.776010513305664, "learning_rate": 1.9106122448979595e-05, "loss": 0.6616011261940002, "step": 319 }, { "epoch": 0.064, "grad_norm": 19.98860740661621, "learning_rate": 1.9102040816326533e-05, "loss": 0.7814601063728333, "step": 320 }, { "epoch": 0.0642, "grad_norm": 11.662542343139648, "learning_rate": 1.909795918367347e-05, "loss": 0.426974892616272, "step": 321 }, { "epoch": 0.0644, "grad_norm": 5.7308526039123535, "learning_rate": 1.909387755102041e-05, "loss": 0.05896879360079765, "step": 322 }, { "epoch": 0.0646, "grad_norm": 10.513677597045898, "learning_rate": 1.908979591836735e-05, "loss": 0.4508655071258545, "step": 323 }, { "epoch": 0.0648, "grad_norm": 7.710137844085693, "learning_rate": 1.9085714285714287e-05, "loss": 0.2659468948841095, "step": 324 }, { "epoch": 0.065, "grad_norm": 14.837307929992676, "learning_rate": 1.9081632653061225e-05, "loss": 0.975304365158081, "step": 325 }, { "epoch": 0.0652, "grad_norm": 17.25518035888672, "learning_rate": 1.9077551020408166e-05, "loss": 0.8888130187988281, "step": 326 }, { "epoch": 0.0654, "grad_norm": 12.68120288848877, "learning_rate": 1.9073469387755103e-05, "loss": 0.40455499291419983, "step": 327 }, { "epoch": 0.0656, "grad_norm": 8.175586700439453, "learning_rate": 1.906938775510204e-05, "loss": 0.4724791944026947, "step": 328 }, { "epoch": 0.0658, "grad_norm": 13.331406593322754, "learning_rate": 1.9065306122448982e-05, "loss": 1.1158009767532349, "step": 329 }, { "epoch": 0.066, "grad_norm": 13.087471008300781, "learning_rate": 1.906122448979592e-05, "loss": 0.8100793361663818, "step": 330 }, { "epoch": 0.0662, "grad_norm": 11.871191024780273, "learning_rate": 1.9057142857142857e-05, "loss": 0.5084172487258911, "step": 331 }, { "epoch": 0.0664, "grad_norm": 15.806512832641602, "learning_rate": 1.9053061224489798e-05, "loss": 0.4743104875087738, "step": 332 }, { "epoch": 0.0666, "grad_norm": 9.702177047729492, "learning_rate": 1.9048979591836736e-05, "loss": 0.6350889801979065, "step": 333 }, { "epoch": 0.0668, "grad_norm": 9.146988868713379, "learning_rate": 1.9044897959183673e-05, "loss": 0.49416258931159973, "step": 334 }, { "epoch": 0.067, "grad_norm": 11.399431228637695, "learning_rate": 1.9040816326530614e-05, "loss": 0.8340964913368225, "step": 335 }, { "epoch": 0.0672, "grad_norm": 7.515564441680908, "learning_rate": 1.9036734693877552e-05, "loss": 0.6184783577919006, "step": 336 }, { "epoch": 0.0674, "grad_norm": 12.320131301879883, "learning_rate": 1.903265306122449e-05, "loss": 0.41817808151245117, "step": 337 }, { "epoch": 0.0676, "grad_norm": 9.78826904296875, "learning_rate": 1.902857142857143e-05, "loss": 0.34572771191596985, "step": 338 }, { "epoch": 0.0678, "grad_norm": 11.655746459960938, "learning_rate": 1.902448979591837e-05, "loss": 0.5032246708869934, "step": 339 }, { "epoch": 0.068, "grad_norm": 10.898517608642578, "learning_rate": 1.9020408163265306e-05, "loss": 0.48967209458351135, "step": 340 }, { "epoch": 0.0682, "grad_norm": 13.701338768005371, "learning_rate": 1.9016326530612247e-05, "loss": 0.5528108477592468, "step": 341 }, { "epoch": 0.0684, "grad_norm": 4.497371196746826, "learning_rate": 1.9012244897959185e-05, "loss": 0.04467979073524475, "step": 342 }, { "epoch": 0.0686, "grad_norm": 8.856463432312012, "learning_rate": 1.9008163265306122e-05, "loss": 0.42360737919807434, "step": 343 }, { "epoch": 0.0688, "grad_norm": 18.086170196533203, "learning_rate": 1.9004081632653063e-05, "loss": 0.14533616602420807, "step": 344 }, { "epoch": 0.069, "grad_norm": 15.018651008605957, "learning_rate": 1.9e-05, "loss": 2.9416942596435547, "step": 345 }, { "epoch": 0.0692, "grad_norm": 12.553912162780762, "learning_rate": 1.899591836734694e-05, "loss": 2.9571597576141357, "step": 346 }, { "epoch": 0.0694, "grad_norm": 10.689021110534668, "learning_rate": 1.899183673469388e-05, "loss": 0.6087742447853088, "step": 347 }, { "epoch": 0.0696, "grad_norm": 9.105196952819824, "learning_rate": 1.8987755102040817e-05, "loss": 0.35657715797424316, "step": 348 }, { "epoch": 0.0698, "grad_norm": 11.22412109375, "learning_rate": 1.8983673469387755e-05, "loss": 0.46766042709350586, "step": 349 }, { "epoch": 0.07, "grad_norm": 6.606272220611572, "learning_rate": 1.8979591836734696e-05, "loss": 0.06433617323637009, "step": 350 }, { "epoch": 0.0702, "grad_norm": 16.827150344848633, "learning_rate": 1.8975510204081634e-05, "loss": 0.88922518491745, "step": 351 }, { "epoch": 0.0704, "grad_norm": 13.23413372039795, "learning_rate": 1.8971428571428575e-05, "loss": 0.843842089176178, "step": 352 }, { "epoch": 0.0706, "grad_norm": 8.296690940856934, "learning_rate": 1.8967346938775512e-05, "loss": 0.18257860839366913, "step": 353 }, { "epoch": 0.0708, "grad_norm": 7.721277713775635, "learning_rate": 1.896326530612245e-05, "loss": 0.2024276703596115, "step": 354 }, { "epoch": 0.071, "grad_norm": 19.015888214111328, "learning_rate": 1.895918367346939e-05, "loss": 1.2812749147415161, "step": 355 }, { "epoch": 0.0712, "grad_norm": 29.96820640563965, "learning_rate": 1.895510204081633e-05, "loss": 2.0008366107940674, "step": 356 }, { "epoch": 0.0714, "grad_norm": 12.12214469909668, "learning_rate": 1.8951020408163266e-05, "loss": 0.4490646421909332, "step": 357 }, { "epoch": 0.0716, "grad_norm": 11.902840614318848, "learning_rate": 1.8946938775510207e-05, "loss": 0.34503689408302307, "step": 358 }, { "epoch": 0.0718, "grad_norm": 18.720752716064453, "learning_rate": 1.8942857142857145e-05, "loss": 2.382197618484497, "step": 359 }, { "epoch": 0.072, "grad_norm": 16.927722930908203, "learning_rate": 1.8938775510204083e-05, "loss": 2.2210466861724854, "step": 360 }, { "epoch": 0.0722, "grad_norm": 10.560834884643555, "learning_rate": 1.8934693877551024e-05, "loss": 2.395998954772949, "step": 361 }, { "epoch": 0.0724, "grad_norm": 11.43918228149414, "learning_rate": 1.893061224489796e-05, "loss": 2.432565927505493, "step": 362 }, { "epoch": 0.0726, "grad_norm": 12.733336448669434, "learning_rate": 1.89265306122449e-05, "loss": 2.5630064010620117, "step": 363 }, { "epoch": 0.0728, "grad_norm": 10.23054027557373, "learning_rate": 1.892244897959184e-05, "loss": 2.4672763347625732, "step": 364 }, { "epoch": 0.073, "grad_norm": 13.976507186889648, "learning_rate": 1.8918367346938778e-05, "loss": 1.5226325988769531, "step": 365 }, { "epoch": 0.0732, "grad_norm": 12.58265495300293, "learning_rate": 1.8914285714285715e-05, "loss": 0.5800626873970032, "step": 366 }, { "epoch": 0.0734, "grad_norm": 12.559310913085938, "learning_rate": 1.8910204081632656e-05, "loss": 0.43674007058143616, "step": 367 }, { "epoch": 0.0736, "grad_norm": 13.056805610656738, "learning_rate": 1.8906122448979594e-05, "loss": 0.6725685000419617, "step": 368 }, { "epoch": 0.0738, "grad_norm": 12.201833724975586, "learning_rate": 1.890204081632653e-05, "loss": 0.5566837787628174, "step": 369 }, { "epoch": 0.074, "grad_norm": 10.063617706298828, "learning_rate": 1.8897959183673473e-05, "loss": 0.7213773131370544, "step": 370 }, { "epoch": 0.0742, "grad_norm": 10.36479663848877, "learning_rate": 1.889387755102041e-05, "loss": 0.5325279831886292, "step": 371 }, { "epoch": 0.0744, "grad_norm": 8.085536003112793, "learning_rate": 1.8889795918367348e-05, "loss": 0.3209850490093231, "step": 372 }, { "epoch": 0.0746, "grad_norm": 17.15733528137207, "learning_rate": 1.888571428571429e-05, "loss": 1.9678653478622437, "step": 373 }, { "epoch": 0.0748, "grad_norm": 14.497695922851562, "learning_rate": 1.8881632653061226e-05, "loss": 1.6297591924667358, "step": 374 }, { "epoch": 0.075, "grad_norm": 16.877817153930664, "learning_rate": 1.8877551020408164e-05, "loss": 1.9466644525527954, "step": 375 }, { "epoch": 0.0752, "grad_norm": 13.411498069763184, "learning_rate": 1.8873469387755102e-05, "loss": 1.7723740339279175, "step": 376 }, { "epoch": 0.0754, "grad_norm": 14.190561294555664, "learning_rate": 1.8869387755102043e-05, "loss": 0.6084811091423035, "step": 377 }, { "epoch": 0.0756, "grad_norm": 14.289806365966797, "learning_rate": 1.886530612244898e-05, "loss": 0.6782891154289246, "step": 378 }, { "epoch": 0.0758, "grad_norm": 15.375550270080566, "learning_rate": 1.8861224489795918e-05, "loss": 1.4179891347885132, "step": 379 }, { "epoch": 0.076, "grad_norm": 9.811424255371094, "learning_rate": 1.885714285714286e-05, "loss": 0.7442667484283447, "step": 380 }, { "epoch": 0.0762, "grad_norm": 13.01511287689209, "learning_rate": 1.8853061224489797e-05, "loss": 1.3766447305679321, "step": 381 }, { "epoch": 0.0764, "grad_norm": 12.5484037399292, "learning_rate": 1.8848979591836734e-05, "loss": 1.2930976152420044, "step": 382 }, { "epoch": 0.0766, "grad_norm": 11.38595962524414, "learning_rate": 1.8844897959183675e-05, "loss": 1.223589301109314, "step": 383 }, { "epoch": 0.0768, "grad_norm": 10.15343189239502, "learning_rate": 1.8840816326530613e-05, "loss": 1.350658893585205, "step": 384 }, { "epoch": 0.077, "grad_norm": 11.184205055236816, "learning_rate": 1.883673469387755e-05, "loss": 1.2625024318695068, "step": 385 }, { "epoch": 0.0772, "grad_norm": 9.304605484008789, "learning_rate": 1.883265306122449e-05, "loss": 1.0793894529342651, "step": 386 }, { "epoch": 0.0774, "grad_norm": 9.564764022827148, "learning_rate": 1.882857142857143e-05, "loss": 0.29348039627075195, "step": 387 }, { "epoch": 0.0776, "grad_norm": 4.9206318855285645, "learning_rate": 1.8824489795918367e-05, "loss": 0.06144172325730324, "step": 388 }, { "epoch": 0.0778, "grad_norm": 14.901082992553711, "learning_rate": 1.8820408163265308e-05, "loss": 2.588120222091675, "step": 389 }, { "epoch": 0.078, "grad_norm": 10.4075288772583, "learning_rate": 1.8816326530612246e-05, "loss": 2.262578248977661, "step": 390 }, { "epoch": 0.0782, "grad_norm": 13.037023544311523, "learning_rate": 1.8812244897959183e-05, "loss": 0.175909161567688, "step": 391 }, { "epoch": 0.0784, "grad_norm": 3.5371270179748535, "learning_rate": 1.8808163265306124e-05, "loss": 0.25394168496131897, "step": 392 }, { "epoch": 0.0786, "grad_norm": 11.433444023132324, "learning_rate": 1.8804081632653062e-05, "loss": 0.4397641122341156, "step": 393 }, { "epoch": 0.0788, "grad_norm": 6.777037143707275, "learning_rate": 1.88e-05, "loss": 0.052677273750305176, "step": 394 }, { "epoch": 0.079, "grad_norm": 9.813986778259277, "learning_rate": 1.879591836734694e-05, "loss": 0.242705300450325, "step": 395 }, { "epoch": 0.0792, "grad_norm": 5.816673278808594, "learning_rate": 1.879183673469388e-05, "loss": 0.08530381321907043, "step": 396 }, { "epoch": 0.0794, "grad_norm": 11.603211402893066, "learning_rate": 1.878775510204082e-05, "loss": 0.48026981949806213, "step": 397 }, { "epoch": 0.0796, "grad_norm": 4.812192440032959, "learning_rate": 1.8783673469387757e-05, "loss": 0.05827462673187256, "step": 398 }, { "epoch": 0.0798, "grad_norm": 13.11069107055664, "learning_rate": 1.8779591836734698e-05, "loss": 1.2811168432235718, "step": 399 }, { "epoch": 0.08, "grad_norm": 10.038864135742188, "learning_rate": 1.8775510204081636e-05, "loss": 0.673806369304657, "step": 400 }, { "epoch": 0.0802, "grad_norm": 11.571066856384277, "learning_rate": 1.8771428571428573e-05, "loss": 0.5589026808738708, "step": 401 }, { "epoch": 0.0804, "grad_norm": 9.705341339111328, "learning_rate": 1.876734693877551e-05, "loss": 0.24957221746444702, "step": 402 }, { "epoch": 0.0806, "grad_norm": 13.06927490234375, "learning_rate": 1.8763265306122452e-05, "loss": 1.4396060705184937, "step": 403 }, { "epoch": 0.0808, "grad_norm": 17.341108322143555, "learning_rate": 1.875918367346939e-05, "loss": 0.6699572205543518, "step": 404 }, { "epoch": 0.081, "grad_norm": 10.573736190795898, "learning_rate": 1.8755102040816327e-05, "loss": 1.288001298904419, "step": 405 }, { "epoch": 0.0812, "grad_norm": 12.364386558532715, "learning_rate": 1.8751020408163268e-05, "loss": 0.603476881980896, "step": 406 }, { "epoch": 0.0814, "grad_norm": 13.087056159973145, "learning_rate": 1.8746938775510206e-05, "loss": 0.4683491289615631, "step": 407 }, { "epoch": 0.0816, "grad_norm": 9.821493148803711, "learning_rate": 1.8742857142857143e-05, "loss": 0.3452199399471283, "step": 408 }, { "epoch": 0.0818, "grad_norm": 10.179647445678711, "learning_rate": 1.8738775510204084e-05, "loss": 0.6895151138305664, "step": 409 }, { "epoch": 0.082, "grad_norm": 12.020050048828125, "learning_rate": 1.8734693877551022e-05, "loss": 0.35574761033058167, "step": 410 }, { "epoch": 0.0822, "grad_norm": 14.25110149383545, "learning_rate": 1.873061224489796e-05, "loss": 0.3857842683792114, "step": 411 }, { "epoch": 0.0824, "grad_norm": 8.994257926940918, "learning_rate": 1.87265306122449e-05, "loss": 0.4511173665523529, "step": 412 }, { "epoch": 0.0826, "grad_norm": 18.089815139770508, "learning_rate": 1.872244897959184e-05, "loss": 0.9559245109558105, "step": 413 }, { "epoch": 0.0828, "grad_norm": 17.2407169342041, "learning_rate": 1.8718367346938776e-05, "loss": 0.8183043599128723, "step": 414 }, { "epoch": 0.083, "grad_norm": 14.020536422729492, "learning_rate": 1.8714285714285717e-05, "loss": 0.5215857028961182, "step": 415 }, { "epoch": 0.0832, "grad_norm": 12.235998153686523, "learning_rate": 1.8710204081632655e-05, "loss": 0.47453007102012634, "step": 416 }, { "epoch": 0.0834, "grad_norm": 10.737859725952148, "learning_rate": 1.8706122448979592e-05, "loss": 0.3702681362628937, "step": 417 }, { "epoch": 0.0836, "grad_norm": 12.272703170776367, "learning_rate": 1.8702040816326533e-05, "loss": 0.4834024906158447, "step": 418 }, { "epoch": 0.0838, "grad_norm": 10.39461612701416, "learning_rate": 1.869795918367347e-05, "loss": 0.5086528062820435, "step": 419 }, { "epoch": 0.084, "grad_norm": 9.570913314819336, "learning_rate": 1.869387755102041e-05, "loss": 0.49558886885643005, "step": 420 }, { "epoch": 0.0842, "grad_norm": 12.726007461547852, "learning_rate": 1.868979591836735e-05, "loss": 0.38814687728881836, "step": 421 }, { "epoch": 0.0844, "grad_norm": 10.535909652709961, "learning_rate": 1.8685714285714287e-05, "loss": 0.5980618596076965, "step": 422 }, { "epoch": 0.0846, "grad_norm": 12.708730697631836, "learning_rate": 1.8681632653061225e-05, "loss": 1.2424190044403076, "step": 423 }, { "epoch": 0.0848, "grad_norm": 8.670951843261719, "learning_rate": 1.8677551020408166e-05, "loss": 0.856053352355957, "step": 424 }, { "epoch": 0.085, "grad_norm": 7.738373279571533, "learning_rate": 1.8673469387755104e-05, "loss": 0.2559834420681, "step": 425 }, { "epoch": 0.0852, "grad_norm": 12.78547477722168, "learning_rate": 1.866938775510204e-05, "loss": 0.24063949286937714, "step": 426 }, { "epoch": 0.0854, "grad_norm": 10.760578155517578, "learning_rate": 1.8665306122448982e-05, "loss": 0.21334774792194366, "step": 427 }, { "epoch": 0.0856, "grad_norm": 8.15738582611084, "learning_rate": 1.866122448979592e-05, "loss": 0.12981675565242767, "step": 428 }, { "epoch": 0.0858, "grad_norm": 13.824641227722168, "learning_rate": 1.8657142857142858e-05, "loss": 0.8066617846488953, "step": 429 }, { "epoch": 0.086, "grad_norm": 10.609761238098145, "learning_rate": 1.8653061224489795e-05, "loss": 0.8554279208183289, "step": 430 }, { "epoch": 0.0862, "grad_norm": 14.266653060913086, "learning_rate": 1.8648979591836736e-05, "loss": 0.6035276055335999, "step": 431 }, { "epoch": 0.0864, "grad_norm": 12.586504936218262, "learning_rate": 1.8644897959183674e-05, "loss": 0.8313179016113281, "step": 432 }, { "epoch": 0.0866, "grad_norm": 10.952289581298828, "learning_rate": 1.864081632653061e-05, "loss": 0.7162933349609375, "step": 433 }, { "epoch": 0.0868, "grad_norm": 10.059403419494629, "learning_rate": 1.8636734693877553e-05, "loss": 0.2089420109987259, "step": 434 }, { "epoch": 0.087, "grad_norm": 12.50546932220459, "learning_rate": 1.863265306122449e-05, "loss": 0.8918299078941345, "step": 435 }, { "epoch": 0.0872, "grad_norm": 8.853002548217773, "learning_rate": 1.8628571428571428e-05, "loss": 0.1686689257621765, "step": 436 }, { "epoch": 0.0874, "grad_norm": 11.31886100769043, "learning_rate": 1.862448979591837e-05, "loss": 1.5455905199050903, "step": 437 }, { "epoch": 0.0876, "grad_norm": 19.86179542541504, "learning_rate": 1.8620408163265307e-05, "loss": 0.8918477892875671, "step": 438 }, { "epoch": 0.0878, "grad_norm": 40.09619903564453, "learning_rate": 1.8616326530612244e-05, "loss": 2.142383337020874, "step": 439 }, { "epoch": 0.088, "grad_norm": 38.148292541503906, "learning_rate": 1.8612244897959185e-05, "loss": 2.0496737957000732, "step": 440 }, { "epoch": 0.0882, "grad_norm": 14.241689682006836, "learning_rate": 1.8608163265306126e-05, "loss": 0.4535129964351654, "step": 441 }, { "epoch": 0.0884, "grad_norm": 9.968354225158691, "learning_rate": 1.8604081632653064e-05, "loss": 0.1674795150756836, "step": 442 }, { "epoch": 0.0886, "grad_norm": 15.795212745666504, "learning_rate": 1.86e-05, "loss": 0.9672284126281738, "step": 443 }, { "epoch": 0.0888, "grad_norm": 10.975865364074707, "learning_rate": 1.8595918367346943e-05, "loss": 1.0675843954086304, "step": 444 }, { "epoch": 0.089, "grad_norm": 11.84468936920166, "learning_rate": 1.859183673469388e-05, "loss": 0.47408929467201233, "step": 445 }, { "epoch": 0.0892, "grad_norm": 10.550907135009766, "learning_rate": 1.8587755102040818e-05, "loss": 0.29741260409355164, "step": 446 }, { "epoch": 0.0894, "grad_norm": 12.465574264526367, "learning_rate": 1.858367346938776e-05, "loss": 1.206751823425293, "step": 447 }, { "epoch": 0.0896, "grad_norm": 13.958642959594727, "learning_rate": 1.8579591836734696e-05, "loss": 0.9328053593635559, "step": 448 }, { "epoch": 0.0898, "grad_norm": 18.270723342895508, "learning_rate": 1.8575510204081634e-05, "loss": 0.27732086181640625, "step": 449 }, { "epoch": 0.09, "grad_norm": 15.656776428222656, "learning_rate": 1.8571428571428575e-05, "loss": 0.2517017722129822, "step": 450 }, { "epoch": 0.0902, "grad_norm": 16.57853126525879, "learning_rate": 1.8567346938775513e-05, "loss": 0.6180245280265808, "step": 451 }, { "epoch": 0.0904, "grad_norm": 19.03170394897461, "learning_rate": 1.856326530612245e-05, "loss": 0.49093642830848694, "step": 452 }, { "epoch": 0.0906, "grad_norm": 12.310887336730957, "learning_rate": 1.855918367346939e-05, "loss": 0.3317486643791199, "step": 453 }, { "epoch": 0.0908, "grad_norm": 11.586458206176758, "learning_rate": 1.855510204081633e-05, "loss": 0.35807475447654724, "step": 454 }, { "epoch": 0.091, "grad_norm": 10.65620231628418, "learning_rate": 1.8551020408163267e-05, "loss": 0.36819103360176086, "step": 455 }, { "epoch": 0.0912, "grad_norm": 10.369810104370117, "learning_rate": 1.8546938775510204e-05, "loss": 0.30374273657798767, "step": 456 }, { "epoch": 0.0914, "grad_norm": 12.69101333618164, "learning_rate": 1.8542857142857145e-05, "loss": 1.052322268486023, "step": 457 }, { "epoch": 0.0916, "grad_norm": 8.62214469909668, "learning_rate": 1.8538775510204083e-05, "loss": 0.3165879547595978, "step": 458 }, { "epoch": 0.0918, "grad_norm": 13.178669929504395, "learning_rate": 1.853469387755102e-05, "loss": 0.9034677147865295, "step": 459 }, { "epoch": 0.092, "grad_norm": 8.487058639526367, "learning_rate": 1.853061224489796e-05, "loss": 0.5634719133377075, "step": 460 }, { "epoch": 0.0922, "grad_norm": 10.703817367553711, "learning_rate": 1.85265306122449e-05, "loss": 0.4245595932006836, "step": 461 }, { "epoch": 0.0924, "grad_norm": 5.564105987548828, "learning_rate": 1.8522448979591837e-05, "loss": 0.05999644473195076, "step": 462 }, { "epoch": 0.0926, "grad_norm": 5.57875919342041, "learning_rate": 1.8518367346938778e-05, "loss": 0.19407449662685394, "step": 463 }, { "epoch": 0.0928, "grad_norm": 5.318073272705078, "learning_rate": 1.8514285714285716e-05, "loss": 0.1788109540939331, "step": 464 }, { "epoch": 0.093, "grad_norm": 15.313523292541504, "learning_rate": 1.8510204081632653e-05, "loss": 0.68692547082901, "step": 465 }, { "epoch": 0.0932, "grad_norm": 11.062304496765137, "learning_rate": 1.8506122448979594e-05, "loss": 0.8494117856025696, "step": 466 }, { "epoch": 0.0934, "grad_norm": 16.3399658203125, "learning_rate": 1.8502040816326532e-05, "loss": 0.786237895488739, "step": 467 }, { "epoch": 0.0936, "grad_norm": 10.19522476196289, "learning_rate": 1.849795918367347e-05, "loss": 0.9438123106956482, "step": 468 }, { "epoch": 0.0938, "grad_norm": 12.138736724853516, "learning_rate": 1.849387755102041e-05, "loss": 0.4503382742404938, "step": 469 }, { "epoch": 0.094, "grad_norm": 10.992502212524414, "learning_rate": 1.8489795918367348e-05, "loss": 0.6347247958183289, "step": 470 }, { "epoch": 0.0942, "grad_norm": 12.836503028869629, "learning_rate": 1.8485714285714286e-05, "loss": 1.1122783422470093, "step": 471 }, { "epoch": 0.0944, "grad_norm": 14.83193302154541, "learning_rate": 1.8481632653061227e-05, "loss": 1.1462980508804321, "step": 472 }, { "epoch": 0.0946, "grad_norm": 11.722692489624023, "learning_rate": 1.8477551020408165e-05, "loss": 0.5339711308479309, "step": 473 }, { "epoch": 0.0948, "grad_norm": 9.716527938842773, "learning_rate": 1.8473469387755102e-05, "loss": 0.5523082613945007, "step": 474 }, { "epoch": 0.095, "grad_norm": 11.501835823059082, "learning_rate": 1.8469387755102043e-05, "loss": 0.6342002749443054, "step": 475 }, { "epoch": 0.0952, "grad_norm": 7.19667911529541, "learning_rate": 1.846530612244898e-05, "loss": 0.17063356935977936, "step": 476 }, { "epoch": 0.0954, "grad_norm": 11.882790565490723, "learning_rate": 1.846122448979592e-05, "loss": 0.7495070099830627, "step": 477 }, { "epoch": 0.0956, "grad_norm": 7.023852348327637, "learning_rate": 1.845714285714286e-05, "loss": 0.2240893840789795, "step": 478 }, { "epoch": 0.0958, "grad_norm": 13.528419494628906, "learning_rate": 1.8453061224489797e-05, "loss": 1.1490691900253296, "step": 479 }, { "epoch": 0.096, "grad_norm": 14.662690162658691, "learning_rate": 1.8448979591836735e-05, "loss": 1.295188307762146, "step": 480 }, { "epoch": 0.0962, "grad_norm": 10.792596817016602, "learning_rate": 1.8444897959183672e-05, "loss": 0.4174814224243164, "step": 481 }, { "epoch": 0.0964, "grad_norm": 6.941728115081787, "learning_rate": 1.8440816326530613e-05, "loss": 0.15877938270568848, "step": 482 }, { "epoch": 0.0966, "grad_norm": 19.152055740356445, "learning_rate": 1.843673469387755e-05, "loss": 1.4050027132034302, "step": 483 }, { "epoch": 0.0968, "grad_norm": 11.851791381835938, "learning_rate": 1.843265306122449e-05, "loss": 0.5855047106742859, "step": 484 }, { "epoch": 0.097, "grad_norm": 15.214301109313965, "learning_rate": 1.842857142857143e-05, "loss": 0.3991682529449463, "step": 485 }, { "epoch": 0.0972, "grad_norm": 11.383341789245605, "learning_rate": 1.842448979591837e-05, "loss": 0.36300501227378845, "step": 486 }, { "epoch": 0.0974, "grad_norm": 8.519134521484375, "learning_rate": 1.842040816326531e-05, "loss": 0.5041393637657166, "step": 487 }, { "epoch": 0.0976, "grad_norm": 10.494160652160645, "learning_rate": 1.8416326530612246e-05, "loss": 0.8635578751564026, "step": 488 }, { "epoch": 0.0978, "grad_norm": 11.271151542663574, "learning_rate": 1.8412244897959187e-05, "loss": 0.6950626969337463, "step": 489 }, { "epoch": 0.098, "grad_norm": 6.710206031799316, "learning_rate": 1.8408163265306125e-05, "loss": 0.4448658525943756, "step": 490 }, { "epoch": 0.0982, "grad_norm": 11.673338890075684, "learning_rate": 1.8404081632653062e-05, "loss": 0.8156289458274841, "step": 491 }, { "epoch": 0.0984, "grad_norm": 7.988609313964844, "learning_rate": 1.8400000000000003e-05, "loss": 0.5245321989059448, "step": 492 }, { "epoch": 0.0986, "grad_norm": 10.861720085144043, "learning_rate": 1.839591836734694e-05, "loss": 0.42232275009155273, "step": 493 }, { "epoch": 0.0988, "grad_norm": 7.797056198120117, "learning_rate": 1.839183673469388e-05, "loss": 0.0740867406129837, "step": 494 }, { "epoch": 0.099, "grad_norm": 39.92266845703125, "learning_rate": 1.838775510204082e-05, "loss": 2.273386240005493, "step": 495 }, { "epoch": 0.0992, "grad_norm": 33.89152908325195, "learning_rate": 1.8383673469387757e-05, "loss": 1.6251716613769531, "step": 496 }, { "epoch": 0.0994, "grad_norm": 13.701949119567871, "learning_rate": 1.8379591836734695e-05, "loss": 0.48105981945991516, "step": 497 }, { "epoch": 0.0996, "grad_norm": 14.770544052124023, "learning_rate": 1.8375510204081636e-05, "loss": 0.45467638969421387, "step": 498 }, { "epoch": 0.0998, "grad_norm": 10.267172813415527, "learning_rate": 1.8371428571428574e-05, "loss": 0.27381423115730286, "step": 499 }, { "epoch": 0.1, "grad_norm": 12.922121047973633, "learning_rate": 1.836734693877551e-05, "loss": 0.3002593517303467, "step": 500 }, { "epoch": 0.1002, "grad_norm": 10.551634788513184, "learning_rate": 1.8363265306122452e-05, "loss": 0.22908790409564972, "step": 501 }, { "epoch": 0.1004, "grad_norm": 10.607389450073242, "learning_rate": 1.835918367346939e-05, "loss": 0.24098807573318481, "step": 502 }, { "epoch": 0.1006, "grad_norm": 8.937371253967285, "learning_rate": 1.8355102040816328e-05, "loss": 0.24399320781230927, "step": 503 }, { "epoch": 0.1008, "grad_norm": 8.051960945129395, "learning_rate": 1.835102040816327e-05, "loss": 0.28692182898521423, "step": 504 }, { "epoch": 0.101, "grad_norm": 14.802160263061523, "learning_rate": 1.8346938775510206e-05, "loss": 0.3659327030181885, "step": 505 }, { "epoch": 0.1012, "grad_norm": 9.041366577148438, "learning_rate": 1.8342857142857144e-05, "loss": 0.16635489463806152, "step": 506 }, { "epoch": 0.1014, "grad_norm": 9.571503639221191, "learning_rate": 1.833877551020408e-05, "loss": 0.6271096467971802, "step": 507 }, { "epoch": 0.1016, "grad_norm": 12.329565048217773, "learning_rate": 1.8334693877551023e-05, "loss": 0.8489407896995544, "step": 508 }, { "epoch": 0.1018, "grad_norm": 9.421364784240723, "learning_rate": 1.833061224489796e-05, "loss": 0.24840576946735382, "step": 509 }, { "epoch": 0.102, "grad_norm": 7.825616836547852, "learning_rate": 1.8326530612244898e-05, "loss": 0.12121107429265976, "step": 510 }, { "epoch": 0.1022, "grad_norm": 9.709155082702637, "learning_rate": 1.832244897959184e-05, "loss": 0.5488207340240479, "step": 511 }, { "epoch": 0.1024, "grad_norm": 9.480342864990234, "learning_rate": 1.8318367346938777e-05, "loss": 0.4018261730670929, "step": 512 }, { "epoch": 0.1026, "grad_norm": 8.605011940002441, "learning_rate": 1.8314285714285714e-05, "loss": 0.3484914302825928, "step": 513 }, { "epoch": 0.1028, "grad_norm": 12.490439414978027, "learning_rate": 1.8310204081632655e-05, "loss": 0.28063103556632996, "step": 514 }, { "epoch": 0.103, "grad_norm": 9.80734920501709, "learning_rate": 1.8306122448979593e-05, "loss": 0.3822651207447052, "step": 515 }, { "epoch": 0.1032, "grad_norm": 4.705074787139893, "learning_rate": 1.830204081632653e-05, "loss": 0.05691803619265556, "step": 516 }, { "epoch": 0.1034, "grad_norm": 20.423694610595703, "learning_rate": 1.829795918367347e-05, "loss": 0.4564323425292969, "step": 517 }, { "epoch": 0.1036, "grad_norm": 8.043682098388672, "learning_rate": 1.829387755102041e-05, "loss": 0.18622785806655884, "step": 518 }, { "epoch": 0.1038, "grad_norm": 11.38018798828125, "learning_rate": 1.8289795918367347e-05, "loss": 0.5270686745643616, "step": 519 }, { "epoch": 0.104, "grad_norm": 7.291547775268555, "learning_rate": 1.8285714285714288e-05, "loss": 0.1516893357038498, "step": 520 }, { "epoch": 0.1042, "grad_norm": 12.855888366699219, "learning_rate": 1.8281632653061225e-05, "loss": 0.6389946341514587, "step": 521 }, { "epoch": 0.1044, "grad_norm": 9.985026359558105, "learning_rate": 1.8277551020408163e-05, "loss": 0.7234979271888733, "step": 522 }, { "epoch": 0.1046, "grad_norm": 12.087357521057129, "learning_rate": 1.8273469387755104e-05, "loss": 0.7861669659614563, "step": 523 }, { "epoch": 0.1048, "grad_norm": 15.694661140441895, "learning_rate": 1.8269387755102042e-05, "loss": 0.5220512747764587, "step": 524 }, { "epoch": 0.105, "grad_norm": 12.429661750793457, "learning_rate": 1.826530612244898e-05, "loss": 0.6339209675788879, "step": 525 }, { "epoch": 0.1052, "grad_norm": 13.236414909362793, "learning_rate": 1.826122448979592e-05, "loss": 0.4826647937297821, "step": 526 }, { "epoch": 0.1054, "grad_norm": 10.18588924407959, "learning_rate": 1.8257142857142858e-05, "loss": 0.40340495109558105, "step": 527 }, { "epoch": 0.1056, "grad_norm": 9.78604793548584, "learning_rate": 1.8253061224489796e-05, "loss": 0.37776312232017517, "step": 528 }, { "epoch": 0.1058, "grad_norm": 10.337906837463379, "learning_rate": 1.8248979591836737e-05, "loss": 0.32520875334739685, "step": 529 }, { "epoch": 0.106, "grad_norm": 5.0009965896606445, "learning_rate": 1.8244897959183674e-05, "loss": 0.1371188759803772, "step": 530 }, { "epoch": 0.1062, "grad_norm": 10.34693431854248, "learning_rate": 1.8240816326530612e-05, "loss": 0.3658228814601898, "step": 531 }, { "epoch": 0.1064, "grad_norm": 12.065381050109863, "learning_rate": 1.8236734693877553e-05, "loss": 0.49137282371520996, "step": 532 }, { "epoch": 0.1066, "grad_norm": 12.316068649291992, "learning_rate": 1.823265306122449e-05, "loss": 0.39971527457237244, "step": 533 }, { "epoch": 0.1068, "grad_norm": 9.137246131896973, "learning_rate": 1.822857142857143e-05, "loss": 0.36186349391937256, "step": 534 }, { "epoch": 0.107, "grad_norm": 7.118947505950928, "learning_rate": 1.822448979591837e-05, "loss": 0.2133263200521469, "step": 535 }, { "epoch": 0.1072, "grad_norm": 6.141784191131592, "learning_rate": 1.8220408163265307e-05, "loss": 0.12082049995660782, "step": 536 }, { "epoch": 0.1074, "grad_norm": 12.222509384155273, "learning_rate": 1.8216326530612248e-05, "loss": 0.44458451867103577, "step": 537 }, { "epoch": 0.1076, "grad_norm": 14.006424903869629, "learning_rate": 1.8212244897959186e-05, "loss": 0.3920406401157379, "step": 538 }, { "epoch": 0.1078, "grad_norm": 13.02974796295166, "learning_rate": 1.8208163265306123e-05, "loss": 0.3564494848251343, "step": 539 }, { "epoch": 0.108, "grad_norm": 18.711116790771484, "learning_rate": 1.8204081632653064e-05, "loss": 0.344758003950119, "step": 540 }, { "epoch": 0.1082, "grad_norm": 20.329181671142578, "learning_rate": 1.8200000000000002e-05, "loss": 0.9119019508361816, "step": 541 }, { "epoch": 0.1084, "grad_norm": 19.114761352539062, "learning_rate": 1.819591836734694e-05, "loss": 0.7259604334831238, "step": 542 }, { "epoch": 0.1086, "grad_norm": 19.6895809173584, "learning_rate": 1.819183673469388e-05, "loss": 1.1039327383041382, "step": 543 }, { "epoch": 0.1088, "grad_norm": 14.621075630187988, "learning_rate": 1.8187755102040818e-05, "loss": 0.6637970805168152, "step": 544 }, { "epoch": 0.109, "grad_norm": 31.219099044799805, "learning_rate": 1.8183673469387756e-05, "loss": 0.852077305316925, "step": 545 }, { "epoch": 0.1092, "grad_norm": 26.689311981201172, "learning_rate": 1.8179591836734697e-05, "loss": 0.778461754322052, "step": 546 }, { "epoch": 0.1094, "grad_norm": 10.68387222290039, "learning_rate": 1.8175510204081635e-05, "loss": 0.7177695631980896, "step": 547 }, { "epoch": 0.1096, "grad_norm": 9.821282386779785, "learning_rate": 1.8171428571428572e-05, "loss": 0.755763828754425, "step": 548 }, { "epoch": 0.1098, "grad_norm": 14.495914459228516, "learning_rate": 1.8167346938775513e-05, "loss": 1.236759901046753, "step": 549 }, { "epoch": 0.11, "grad_norm": 8.800137519836426, "learning_rate": 1.816326530612245e-05, "loss": 1.1966830492019653, "step": 550 }, { "epoch": 0.1102, "grad_norm": 9.097552299499512, "learning_rate": 1.815918367346939e-05, "loss": 0.3670867383480072, "step": 551 }, { "epoch": 0.1104, "grad_norm": 4.9611430168151855, "learning_rate": 1.815510204081633e-05, "loss": 0.0949157252907753, "step": 552 }, { "epoch": 0.1106, "grad_norm": 13.15954303741455, "learning_rate": 1.8151020408163267e-05, "loss": 2.18172025680542, "step": 553 }, { "epoch": 0.1108, "grad_norm": 15.633612632751465, "learning_rate": 1.8146938775510205e-05, "loss": 2.1954867839813232, "step": 554 }, { "epoch": 0.111, "grad_norm": 21.19856834411621, "learning_rate": 1.8142857142857146e-05, "loss": 1.004319190979004, "step": 555 }, { "epoch": 0.1112, "grad_norm": 16.217754364013672, "learning_rate": 1.8138775510204083e-05, "loss": 0.8675961494445801, "step": 556 }, { "epoch": 0.1114, "grad_norm": 11.018362998962402, "learning_rate": 1.813469387755102e-05, "loss": 0.3288007080554962, "step": 557 }, { "epoch": 0.1116, "grad_norm": 12.571540832519531, "learning_rate": 1.8130612244897962e-05, "loss": 0.23565353453159332, "step": 558 }, { "epoch": 0.1118, "grad_norm": 13.624663352966309, "learning_rate": 1.81265306122449e-05, "loss": 0.33491167426109314, "step": 559 }, { "epoch": 0.112, "grad_norm": 12.053435325622559, "learning_rate": 1.8122448979591837e-05, "loss": 0.2845800817012787, "step": 560 }, { "epoch": 0.1122, "grad_norm": 12.036954879760742, "learning_rate": 1.8118367346938775e-05, "loss": 1.890631079673767, "step": 561 }, { "epoch": 0.1124, "grad_norm": 12.141047477722168, "learning_rate": 1.8114285714285716e-05, "loss": 1.7847900390625, "step": 562 }, { "epoch": 0.1126, "grad_norm": 14.690183639526367, "learning_rate": 1.8110204081632654e-05, "loss": 1.274082064628601, "step": 563 }, { "epoch": 0.1128, "grad_norm": 14.72066879272461, "learning_rate": 1.810612244897959e-05, "loss": 1.1627548933029175, "step": 564 }, { "epoch": 0.113, "grad_norm": 14.321584701538086, "learning_rate": 1.8102040816326532e-05, "loss": 2.852611541748047, "step": 565 }, { "epoch": 0.1132, "grad_norm": 11.83179759979248, "learning_rate": 1.809795918367347e-05, "loss": 2.830625295639038, "step": 566 }, { "epoch": 0.1134, "grad_norm": 12.19902515411377, "learning_rate": 1.8093877551020408e-05, "loss": 0.9268226623535156, "step": 567 }, { "epoch": 0.1136, "grad_norm": 6.607133865356445, "learning_rate": 1.808979591836735e-05, "loss": 0.3435347378253937, "step": 568 }, { "epoch": 0.1138, "grad_norm": 10.809368133544922, "learning_rate": 1.8085714285714286e-05, "loss": 0.43306198716163635, "step": 569 }, { "epoch": 0.114, "grad_norm": 10.219008445739746, "learning_rate": 1.8081632653061224e-05, "loss": 0.3536444902420044, "step": 570 }, { "epoch": 0.1142, "grad_norm": 7.8596367835998535, "learning_rate": 1.8077551020408165e-05, "loss": 2.3697917461395264, "step": 571 }, { "epoch": 0.1144, "grad_norm": 6.792211532592773, "learning_rate": 1.8073469387755103e-05, "loss": 2.2773189544677734, "step": 572 }, { "epoch": 0.1146, "grad_norm": 10.349200248718262, "learning_rate": 1.806938775510204e-05, "loss": 0.4243791997432709, "step": 573 }, { "epoch": 0.1148, "grad_norm": 3.6428349018096924, "learning_rate": 1.806530612244898e-05, "loss": 0.039050281047821045, "step": 574 }, { "epoch": 0.115, "grad_norm": 9.740432739257812, "learning_rate": 1.806122448979592e-05, "loss": 0.378023624420166, "step": 575 }, { "epoch": 0.1152, "grad_norm": 6.016699314117432, "learning_rate": 1.8057142857142857e-05, "loss": 0.06636029481887817, "step": 576 }, { "epoch": 0.1154, "grad_norm": 11.535165786743164, "learning_rate": 1.8053061224489798e-05, "loss": 1.3998335599899292, "step": 577 }, { "epoch": 0.1156, "grad_norm": 8.401366233825684, "learning_rate": 1.804897959183674e-05, "loss": 0.6503568291664124, "step": 578 }, { "epoch": 0.1158, "grad_norm": 18.115427017211914, "learning_rate": 1.8044897959183676e-05, "loss": 3.949636459350586, "step": 579 }, { "epoch": 0.116, "grad_norm": 15.45248794555664, "learning_rate": 1.8040816326530614e-05, "loss": 3.7633888721466064, "step": 580 }, { "epoch": 0.1162, "grad_norm": 20.35097312927246, "learning_rate": 1.8036734693877555e-05, "loss": 0.8000307679176331, "step": 581 }, { "epoch": 0.1164, "grad_norm": 21.418190002441406, "learning_rate": 1.8032653061224493e-05, "loss": 0.7529289722442627, "step": 582 }, { "epoch": 0.1166, "grad_norm": 13.8251953125, "learning_rate": 1.802857142857143e-05, "loss": 1.4717602729797363, "step": 583 }, { "epoch": 0.1168, "grad_norm": 15.150423049926758, "learning_rate": 1.802448979591837e-05, "loss": 0.936460018157959, "step": 584 }, { "epoch": 0.117, "grad_norm": 10.854708671569824, "learning_rate": 1.802040816326531e-05, "loss": 0.5200664401054382, "step": 585 }, { "epoch": 0.1172, "grad_norm": 9.888029098510742, "learning_rate": 1.8016326530612247e-05, "loss": 0.33132681250572205, "step": 586 }, { "epoch": 0.1174, "grad_norm": 11.01172924041748, "learning_rate": 1.8012244897959184e-05, "loss": 0.4022061824798584, "step": 587 }, { "epoch": 0.1176, "grad_norm": 5.216026782989502, "learning_rate": 1.8008163265306125e-05, "loss": 0.3725966513156891, "step": 588 }, { "epoch": 0.1178, "grad_norm": 11.804418563842773, "learning_rate": 1.8004081632653063e-05, "loss": 0.5011307597160339, "step": 589 }, { "epoch": 0.118, "grad_norm": 8.733563423156738, "learning_rate": 1.8e-05, "loss": 0.2053084820508957, "step": 590 }, { "epoch": 0.1182, "grad_norm": 18.12481117248535, "learning_rate": 1.799591836734694e-05, "loss": 0.7179663181304932, "step": 591 }, { "epoch": 0.1184, "grad_norm": 17.2807559967041, "learning_rate": 1.799183673469388e-05, "loss": 0.5297542214393616, "step": 592 }, { "epoch": 0.1186, "grad_norm": 38.80964279174805, "learning_rate": 1.7987755102040817e-05, "loss": 2.674657106399536, "step": 593 }, { "epoch": 0.1188, "grad_norm": 38.70146942138672, "learning_rate": 1.7983673469387758e-05, "loss": 2.700640916824341, "step": 594 }, { "epoch": 0.119, "grad_norm": 9.609235763549805, "learning_rate": 1.7979591836734695e-05, "loss": 0.6304271817207336, "step": 595 }, { "epoch": 0.1192, "grad_norm": 8.441871643066406, "learning_rate": 1.7975510204081633e-05, "loss": 0.40333864092826843, "step": 596 }, { "epoch": 0.1194, "grad_norm": 10.5399169921875, "learning_rate": 1.7971428571428574e-05, "loss": 1.5478981733322144, "step": 597 }, { "epoch": 0.1196, "grad_norm": 11.575724601745605, "learning_rate": 1.7967346938775512e-05, "loss": 1.4419304132461548, "step": 598 }, { "epoch": 0.1198, "grad_norm": 9.683016777038574, "learning_rate": 1.796326530612245e-05, "loss": 1.5211687088012695, "step": 599 }, { "epoch": 0.12, "grad_norm": 8.750804901123047, "learning_rate": 1.795918367346939e-05, "loss": 1.2413440942764282, "step": 600 }, { "epoch": 0.1202, "grad_norm": 9.264275550842285, "learning_rate": 1.7955102040816328e-05, "loss": 3.822951316833496, "step": 601 }, { "epoch": 0.1204, "grad_norm": 5.871342658996582, "learning_rate": 1.7951020408163266e-05, "loss": 3.6863601207733154, "step": 602 }, { "epoch": 0.1206, "grad_norm": 10.98366641998291, "learning_rate": 1.7946938775510207e-05, "loss": 0.30503085255622864, "step": 603 }, { "epoch": 0.1208, "grad_norm": 10.534703254699707, "learning_rate": 1.7942857142857144e-05, "loss": 0.4975493252277374, "step": 604 }, { "epoch": 0.121, "grad_norm": 14.222439765930176, "learning_rate": 1.7938775510204082e-05, "loss": 0.3344534933567047, "step": 605 }, { "epoch": 0.1212, "grad_norm": 12.61806869506836, "learning_rate": 1.7934693877551023e-05, "loss": 0.3309352695941925, "step": 606 }, { "epoch": 0.1214, "grad_norm": 13.753547668457031, "learning_rate": 1.793061224489796e-05, "loss": 0.3978783190250397, "step": 607 }, { "epoch": 0.1216, "grad_norm": 13.207645416259766, "learning_rate": 1.7926530612244898e-05, "loss": 0.36826494336128235, "step": 608 }, { "epoch": 0.1218, "grad_norm": 6.715635776519775, "learning_rate": 1.792244897959184e-05, "loss": 0.19177716970443726, "step": 609 }, { "epoch": 0.122, "grad_norm": 6.490627288818359, "learning_rate": 1.7918367346938777e-05, "loss": 0.12844403088092804, "step": 610 }, { "epoch": 0.1222, "grad_norm": 12.998241424560547, "learning_rate": 1.7914285714285715e-05, "loss": 0.4355250895023346, "step": 611 }, { "epoch": 0.1224, "grad_norm": 4.835819721221924, "learning_rate": 1.7910204081632652e-05, "loss": 0.06730211526155472, "step": 612 }, { "epoch": 0.1226, "grad_norm": 12.499910354614258, "learning_rate": 1.7906122448979593e-05, "loss": 0.9613510966300964, "step": 613 }, { "epoch": 0.1228, "grad_norm": 16.230520248413086, "learning_rate": 1.790204081632653e-05, "loss": 1.2226914167404175, "step": 614 }, { "epoch": 0.123, "grad_norm": 12.136612892150879, "learning_rate": 1.789795918367347e-05, "loss": 0.4556024372577667, "step": 615 }, { "epoch": 0.1232, "grad_norm": 11.854321479797363, "learning_rate": 1.789387755102041e-05, "loss": 0.7772054076194763, "step": 616 }, { "epoch": 0.1234, "grad_norm": 7.2010931968688965, "learning_rate": 1.7889795918367347e-05, "loss": 0.17583107948303223, "step": 617 }, { "epoch": 0.1236, "grad_norm": 8.672499656677246, "learning_rate": 1.7885714285714285e-05, "loss": 0.18448638916015625, "step": 618 }, { "epoch": 0.1238, "grad_norm": 9.116911888122559, "learning_rate": 1.7881632653061226e-05, "loss": 0.7564261555671692, "step": 619 }, { "epoch": 0.124, "grad_norm": 10.779072761535645, "learning_rate": 1.7877551020408164e-05, "loss": 0.8381101489067078, "step": 620 }, { "epoch": 0.1242, "grad_norm": 12.201075553894043, "learning_rate": 1.78734693877551e-05, "loss": 0.5493308901786804, "step": 621 }, { "epoch": 0.1244, "grad_norm": 16.000568389892578, "learning_rate": 1.7869387755102042e-05, "loss": 1.2077207565307617, "step": 622 }, { "epoch": 0.1246, "grad_norm": 7.620370388031006, "learning_rate": 1.7865306122448983e-05, "loss": 0.2562389373779297, "step": 623 }, { "epoch": 0.1248, "grad_norm": 7.897797107696533, "learning_rate": 1.786122448979592e-05, "loss": 0.43710216879844666, "step": 624 }, { "epoch": 0.125, "grad_norm": 10.918147087097168, "learning_rate": 1.785714285714286e-05, "loss": 0.3691123425960541, "step": 625 }, { "epoch": 0.1252, "grad_norm": 11.561279296875, "learning_rate": 1.78530612244898e-05, "loss": 0.7716384530067444, "step": 626 }, { "epoch": 0.1254, "grad_norm": 9.598849296569824, "learning_rate": 1.7848979591836737e-05, "loss": 0.599067747592926, "step": 627 }, { "epoch": 0.1256, "grad_norm": 10.998629570007324, "learning_rate": 1.7844897959183675e-05, "loss": 0.9171390533447266, "step": 628 }, { "epoch": 0.1258, "grad_norm": 11.740135192871094, "learning_rate": 1.7840816326530616e-05, "loss": 0.5126091241836548, "step": 629 }, { "epoch": 0.126, "grad_norm": 9.241000175476074, "learning_rate": 1.7836734693877553e-05, "loss": 0.5009287595748901, "step": 630 }, { "epoch": 0.1262, "grad_norm": 15.022573471069336, "learning_rate": 1.783265306122449e-05, "loss": 1.0993671417236328, "step": 631 }, { "epoch": 0.1264, "grad_norm": 14.095918655395508, "learning_rate": 1.7828571428571432e-05, "loss": 0.8669114708900452, "step": 632 }, { "epoch": 0.1266, "grad_norm": 32.64982604980469, "learning_rate": 1.782448979591837e-05, "loss": 1.9596363306045532, "step": 633 }, { "epoch": 0.1268, "grad_norm": 38.364131927490234, "learning_rate": 1.7820408163265307e-05, "loss": 2.6949901580810547, "step": 634 }, { "epoch": 0.127, "grad_norm": 7.988552570343018, "learning_rate": 1.781632653061225e-05, "loss": 0.4111875593662262, "step": 635 }, { "epoch": 0.1272, "grad_norm": 14.480839729309082, "learning_rate": 1.7812244897959186e-05, "loss": 0.436307817697525, "step": 636 }, { "epoch": 0.1274, "grad_norm": 20.296417236328125, "learning_rate": 1.7808163265306124e-05, "loss": 1.3135007619857788, "step": 637 }, { "epoch": 0.1276, "grad_norm": 14.601682662963867, "learning_rate": 1.780408163265306e-05, "loss": 1.0407058000564575, "step": 638 }, { "epoch": 0.1278, "grad_norm": 11.520345687866211, "learning_rate": 1.7800000000000002e-05, "loss": 0.6323835253715515, "step": 639 }, { "epoch": 0.128, "grad_norm": 8.412186622619629, "learning_rate": 1.779591836734694e-05, "loss": 0.3591906726360321, "step": 640 }, { "epoch": 0.1282, "grad_norm": 7.596136569976807, "learning_rate": 1.7791836734693878e-05, "loss": 0.2633102834224701, "step": 641 }, { "epoch": 0.1284, "grad_norm": 7.652013301849365, "learning_rate": 1.778775510204082e-05, "loss": 0.2844131290912628, "step": 642 }, { "epoch": 0.1286, "grad_norm": 14.937522888183594, "learning_rate": 1.7783673469387756e-05, "loss": 0.931900680065155, "step": 643 }, { "epoch": 0.1288, "grad_norm": 11.996570587158203, "learning_rate": 1.7779591836734694e-05, "loss": 0.8581549525260925, "step": 644 }, { "epoch": 0.129, "grad_norm": 11.707404136657715, "learning_rate": 1.7775510204081635e-05, "loss": 1.1206721067428589, "step": 645 }, { "epoch": 0.1292, "grad_norm": 9.904662132263184, "learning_rate": 1.7771428571428573e-05, "loss": 1.031593680381775, "step": 646 }, { "epoch": 0.1294, "grad_norm": 12.293519020080566, "learning_rate": 1.776734693877551e-05, "loss": 1.2122260332107544, "step": 647 }, { "epoch": 0.1296, "grad_norm": 10.546151161193848, "learning_rate": 1.776326530612245e-05, "loss": 1.0906754732131958, "step": 648 }, { "epoch": 0.1298, "grad_norm": 8.968172073364258, "learning_rate": 1.775918367346939e-05, "loss": 0.2922592759132385, "step": 649 }, { "epoch": 0.13, "grad_norm": 9.768681526184082, "learning_rate": 1.7755102040816327e-05, "loss": 0.4742162227630615, "step": 650 }, { "epoch": 0.1302, "grad_norm": 12.723608016967773, "learning_rate": 1.7751020408163268e-05, "loss": 0.7791746258735657, "step": 651 }, { "epoch": 0.1304, "grad_norm": 10.014815330505371, "learning_rate": 1.7746938775510205e-05, "loss": 0.8886891007423401, "step": 652 }, { "epoch": 0.1306, "grad_norm": 13.535554885864258, "learning_rate": 1.7742857142857143e-05, "loss": 0.861926257610321, "step": 653 }, { "epoch": 0.1308, "grad_norm": 14.146342277526855, "learning_rate": 1.7738775510204084e-05, "loss": 1.4108766317367554, "step": 654 }, { "epoch": 0.131, "grad_norm": 11.172720909118652, "learning_rate": 1.773469387755102e-05, "loss": 1.0398005247116089, "step": 655 }, { "epoch": 0.1312, "grad_norm": 12.420480728149414, "learning_rate": 1.773061224489796e-05, "loss": 1.098586082458496, "step": 656 }, { "epoch": 0.1314, "grad_norm": 13.521431922912598, "learning_rate": 1.77265306122449e-05, "loss": 0.5681458115577698, "step": 657 }, { "epoch": 0.1316, "grad_norm": 15.764737129211426, "learning_rate": 1.7722448979591838e-05, "loss": 0.48384639620780945, "step": 658 }, { "epoch": 0.1318, "grad_norm": 12.213311195373535, "learning_rate": 1.7718367346938775e-05, "loss": 0.695458710193634, "step": 659 }, { "epoch": 0.132, "grad_norm": 8.498339653015137, "learning_rate": 1.7714285714285717e-05, "loss": 0.09367576986551285, "step": 660 }, { "epoch": 0.1322, "grad_norm": 13.739344596862793, "learning_rate": 1.7710204081632654e-05, "loss": 0.9440757632255554, "step": 661 }, { "epoch": 0.1324, "grad_norm": 13.500205993652344, "learning_rate": 1.7706122448979592e-05, "loss": 0.5242319107055664, "step": 662 }, { "epoch": 0.1326, "grad_norm": 11.270493507385254, "learning_rate": 1.7702040816326533e-05, "loss": 0.6261625289916992, "step": 663 }, { "epoch": 0.1328, "grad_norm": 11.79617691040039, "learning_rate": 1.769795918367347e-05, "loss": 0.7476010918617249, "step": 664 }, { "epoch": 0.133, "grad_norm": 11.163064956665039, "learning_rate": 1.7693877551020408e-05, "loss": 0.42261502146720886, "step": 665 }, { "epoch": 0.1332, "grad_norm": 8.681324005126953, "learning_rate": 1.7689795918367346e-05, "loss": 0.2355920523405075, "step": 666 }, { "epoch": 0.1334, "grad_norm": 11.953144073486328, "learning_rate": 1.7685714285714287e-05, "loss": 0.447518914937973, "step": 667 }, { "epoch": 0.1336, "grad_norm": 7.357821941375732, "learning_rate": 1.7681632653061228e-05, "loss": 0.4788515865802765, "step": 668 }, { "epoch": 0.1338, "grad_norm": 7.640774726867676, "learning_rate": 1.7677551020408165e-05, "loss": 0.23292386531829834, "step": 669 }, { "epoch": 0.134, "grad_norm": 5.623747825622559, "learning_rate": 1.7673469387755103e-05, "loss": 0.06400511413812637, "step": 670 }, { "epoch": 0.1342, "grad_norm": 14.167566299438477, "learning_rate": 1.7669387755102044e-05, "loss": 1.2401551008224487, "step": 671 }, { "epoch": 0.1344, "grad_norm": 11.210840225219727, "learning_rate": 1.7665306122448982e-05, "loss": 0.9809815287590027, "step": 672 }, { "epoch": 0.1346, "grad_norm": 9.620011329650879, "learning_rate": 1.766122448979592e-05, "loss": 0.7593598961830139, "step": 673 }, { "epoch": 0.1348, "grad_norm": 9.036689758300781, "learning_rate": 1.765714285714286e-05, "loss": 0.7323796153068542, "step": 674 }, { "epoch": 0.135, "grad_norm": 8.841421127319336, "learning_rate": 1.7653061224489798e-05, "loss": 0.36300086975097656, "step": 675 }, { "epoch": 0.1352, "grad_norm": 5.024514198303223, "learning_rate": 1.7648979591836736e-05, "loss": 0.1856626719236374, "step": 676 }, { "epoch": 0.1354, "grad_norm": 10.672222137451172, "learning_rate": 1.7644897959183677e-05, "loss": 0.4185608923435211, "step": 677 }, { "epoch": 0.1356, "grad_norm": 5.480457782745361, "learning_rate": 1.7640816326530614e-05, "loss": 0.4270029067993164, "step": 678 }, { "epoch": 0.1358, "grad_norm": 13.384345054626465, "learning_rate": 1.7636734693877552e-05, "loss": 0.9341201782226562, "step": 679 }, { "epoch": 0.136, "grad_norm": 11.61329460144043, "learning_rate": 1.7632653061224493e-05, "loss": 0.8007194399833679, "step": 680 }, { "epoch": 0.1362, "grad_norm": 21.145790100097656, "learning_rate": 1.762857142857143e-05, "loss": 0.6441300511360168, "step": 681 }, { "epoch": 0.1364, "grad_norm": 18.466842651367188, "learning_rate": 1.7624489795918368e-05, "loss": 0.604012668132782, "step": 682 }, { "epoch": 0.1366, "grad_norm": 9.196843147277832, "learning_rate": 1.762040816326531e-05, "loss": 2.540626287460327, "step": 683 }, { "epoch": 0.1368, "grad_norm": 8.944679260253906, "learning_rate": 1.7616326530612247e-05, "loss": 2.4371800422668457, "step": 684 }, { "epoch": 0.137, "grad_norm": 33.24692153930664, "learning_rate": 1.7612244897959185e-05, "loss": 2.222153425216675, "step": 685 }, { "epoch": 0.1372, "grad_norm": 27.14830207824707, "learning_rate": 1.7608163265306126e-05, "loss": 2.2262842655181885, "step": 686 }, { "epoch": 0.1374, "grad_norm": 10.974462509155273, "learning_rate": 1.7604081632653063e-05, "loss": 0.42491415143013, "step": 687 }, { "epoch": 0.1376, "grad_norm": 10.665210723876953, "learning_rate": 1.76e-05, "loss": 0.31951087713241577, "step": 688 }, { "epoch": 0.1378, "grad_norm": 10.70345401763916, "learning_rate": 1.7595918367346942e-05, "loss": 0.4475421905517578, "step": 689 }, { "epoch": 0.138, "grad_norm": 8.484381675720215, "learning_rate": 1.759183673469388e-05, "loss": 0.23042087256908417, "step": 690 }, { "epoch": 0.1382, "grad_norm": 15.865165710449219, "learning_rate": 1.7587755102040817e-05, "loss": 0.6705883145332336, "step": 691 }, { "epoch": 0.1384, "grad_norm": 12.621859550476074, "learning_rate": 1.7583673469387755e-05, "loss": 0.438053697347641, "step": 692 }, { "epoch": 0.1386, "grad_norm": 9.627793312072754, "learning_rate": 1.7579591836734696e-05, "loss": 0.3794168531894684, "step": 693 }, { "epoch": 0.1388, "grad_norm": 4.623700141906738, "learning_rate": 1.7575510204081634e-05, "loss": 0.04528559371829033, "step": 694 }, { "epoch": 0.139, "grad_norm": 9.119061470031738, "learning_rate": 1.757142857142857e-05, "loss": 0.3755207061767578, "step": 695 }, { "epoch": 0.1392, "grad_norm": 9.938539505004883, "learning_rate": 1.7567346938775512e-05, "loss": 0.3410348892211914, "step": 696 }, { "epoch": 0.1394, "grad_norm": 12.629765510559082, "learning_rate": 1.756326530612245e-05, "loss": 0.7066850662231445, "step": 697 }, { "epoch": 0.1396, "grad_norm": 12.002217292785645, "learning_rate": 1.7559183673469387e-05, "loss": 0.7674385905265808, "step": 698 }, { "epoch": 0.1398, "grad_norm": 13.5888671875, "learning_rate": 1.755510204081633e-05, "loss": 1.082494854927063, "step": 699 }, { "epoch": 0.14, "grad_norm": 14.132606506347656, "learning_rate": 1.7551020408163266e-05, "loss": 0.565140426158905, "step": 700 }, { "epoch": 0.1402, "grad_norm": 10.079045295715332, "learning_rate": 1.7546938775510204e-05, "loss": 0.2579793930053711, "step": 701 }, { "epoch": 0.1404, "grad_norm": 8.026155471801758, "learning_rate": 1.7542857142857145e-05, "loss": 0.1645224541425705, "step": 702 }, { "epoch": 0.1406, "grad_norm": 9.82970142364502, "learning_rate": 1.7538775510204082e-05, "loss": 0.1751258820295334, "step": 703 }, { "epoch": 0.1408, "grad_norm": 8.215539932250977, "learning_rate": 1.753469387755102e-05, "loss": 0.1312454342842102, "step": 704 }, { "epoch": 0.141, "grad_norm": 8.542089462280273, "learning_rate": 1.753061224489796e-05, "loss": 0.38043227791786194, "step": 705 }, { "epoch": 0.1412, "grad_norm": 5.60847282409668, "learning_rate": 1.75265306122449e-05, "loss": 0.10540112853050232, "step": 706 }, { "epoch": 0.1414, "grad_norm": 10.964652061462402, "learning_rate": 1.7522448979591836e-05, "loss": 1.119620680809021, "step": 707 }, { "epoch": 0.1416, "grad_norm": 12.320873260498047, "learning_rate": 1.7518367346938777e-05, "loss": 0.46061721444129944, "step": 708 }, { "epoch": 0.1418, "grad_norm": 8.906105995178223, "learning_rate": 1.7514285714285715e-05, "loss": 1.1816500425338745, "step": 709 }, { "epoch": 0.142, "grad_norm": 11.270313262939453, "learning_rate": 1.7510204081632653e-05, "loss": 0.8878250122070312, "step": 710 }, { "epoch": 0.1422, "grad_norm": 14.09372615814209, "learning_rate": 1.7506122448979594e-05, "loss": 1.0754565000534058, "step": 711 }, { "epoch": 0.1424, "grad_norm": 14.562458992004395, "learning_rate": 1.750204081632653e-05, "loss": 0.9496583938598633, "step": 712 }, { "epoch": 0.1426, "grad_norm": 16.510957717895508, "learning_rate": 1.7497959183673472e-05, "loss": 0.6176168918609619, "step": 713 }, { "epoch": 0.1428, "grad_norm": 15.548184394836426, "learning_rate": 1.749387755102041e-05, "loss": 0.5802152156829834, "step": 714 }, { "epoch": 0.143, "grad_norm": 16.158681869506836, "learning_rate": 1.748979591836735e-05, "loss": 0.5384200215339661, "step": 715 }, { "epoch": 0.1432, "grad_norm": 13.481738090515137, "learning_rate": 1.748571428571429e-05, "loss": 1.1257078647613525, "step": 716 }, { "epoch": 0.1434, "grad_norm": 11.724091529846191, "learning_rate": 1.7481632653061226e-05, "loss": 0.45442309975624084, "step": 717 }, { "epoch": 0.1436, "grad_norm": 11.29008674621582, "learning_rate": 1.7477551020408164e-05, "loss": 0.45394429564476013, "step": 718 }, { "epoch": 0.1438, "grad_norm": 6.764264106750488, "learning_rate": 1.7473469387755105e-05, "loss": 0.2008994072675705, "step": 719 }, { "epoch": 0.144, "grad_norm": 5.740670204162598, "learning_rate": 1.7469387755102043e-05, "loss": 0.08834325522184372, "step": 720 }, { "epoch": 0.1442, "grad_norm": 9.330862045288086, "learning_rate": 1.746530612244898e-05, "loss": 0.24800275266170502, "step": 721 }, { "epoch": 0.1444, "grad_norm": 7.405993461608887, "learning_rate": 1.746122448979592e-05, "loss": 0.516298770904541, "step": 722 }, { "epoch": 0.1446, "grad_norm": 14.721088409423828, "learning_rate": 1.745714285714286e-05, "loss": 0.5837984085083008, "step": 723 }, { "epoch": 0.1448, "grad_norm": 10.000396728515625, "learning_rate": 1.7453061224489797e-05, "loss": 0.3792562782764435, "step": 724 }, { "epoch": 0.145, "grad_norm": 10.423584938049316, "learning_rate": 1.7448979591836738e-05, "loss": 0.32581838965415955, "step": 725 }, { "epoch": 0.1452, "grad_norm": 9.63492488861084, "learning_rate": 1.7444897959183675e-05, "loss": 0.3101356029510498, "step": 726 }, { "epoch": 0.1454, "grad_norm": 13.9413423538208, "learning_rate": 1.7440816326530613e-05, "loss": 0.6439652442932129, "step": 727 }, { "epoch": 0.1456, "grad_norm": 9.161519050598145, "learning_rate": 1.7436734693877554e-05, "loss": 0.5394307971000671, "step": 728 }, { "epoch": 0.1458, "grad_norm": 13.081731796264648, "learning_rate": 1.743265306122449e-05, "loss": 0.4912102222442627, "step": 729 }, { "epoch": 0.146, "grad_norm": 11.13801097869873, "learning_rate": 1.742857142857143e-05, "loss": 0.3857848644256592, "step": 730 }, { "epoch": 0.1462, "grad_norm": 13.22642993927002, "learning_rate": 1.742448979591837e-05, "loss": 0.4448005259037018, "step": 731 }, { "epoch": 0.1464, "grad_norm": 11.004619598388672, "learning_rate": 1.7420408163265308e-05, "loss": 0.24569053947925568, "step": 732 }, { "epoch": 0.1466, "grad_norm": 15.297962188720703, "learning_rate": 1.7416326530612245e-05, "loss": 0.5006201863288879, "step": 733 }, { "epoch": 0.1468, "grad_norm": 10.685734748840332, "learning_rate": 1.7412244897959187e-05, "loss": 0.38487866520881653, "step": 734 }, { "epoch": 0.147, "grad_norm": 9.32356071472168, "learning_rate": 1.7408163265306124e-05, "loss": 0.2797485291957855, "step": 735 }, { "epoch": 0.1472, "grad_norm": 11.006173133850098, "learning_rate": 1.7404081632653062e-05, "loss": 0.42651239037513733, "step": 736 }, { "epoch": 0.1474, "grad_norm": 16.145952224731445, "learning_rate": 1.7400000000000003e-05, "loss": 0.47780147194862366, "step": 737 }, { "epoch": 0.1476, "grad_norm": 16.576457977294922, "learning_rate": 1.739591836734694e-05, "loss": 0.5122077465057373, "step": 738 }, { "epoch": 0.1478, "grad_norm": 6.308256149291992, "learning_rate": 1.7391836734693878e-05, "loss": 3.6567018032073975, "step": 739 }, { "epoch": 0.148, "grad_norm": 6.102212429046631, "learning_rate": 1.738775510204082e-05, "loss": 3.6497976779937744, "step": 740 }, { "epoch": 0.1482, "grad_norm": 10.994969367980957, "learning_rate": 1.7383673469387757e-05, "loss": 0.416873574256897, "step": 741 }, { "epoch": 0.1484, "grad_norm": 10.662221908569336, "learning_rate": 1.7379591836734694e-05, "loss": 0.41474536061286926, "step": 742 }, { "epoch": 0.1486, "grad_norm": 16.18564224243164, "learning_rate": 1.7375510204081632e-05, "loss": 1.3894104957580566, "step": 743 }, { "epoch": 0.1488, "grad_norm": 11.509024620056152, "learning_rate": 1.7371428571428573e-05, "loss": 0.9457443356513977, "step": 744 }, { "epoch": 0.149, "grad_norm": 11.563676834106445, "learning_rate": 1.736734693877551e-05, "loss": 1.4715752601623535, "step": 745 }, { "epoch": 0.1492, "grad_norm": 12.36917781829834, "learning_rate": 1.736326530612245e-05, "loss": 1.0810216665267944, "step": 746 }, { "epoch": 0.1494, "grad_norm": 23.451833724975586, "learning_rate": 1.735918367346939e-05, "loss": 1.1819260120391846, "step": 747 }, { "epoch": 0.1496, "grad_norm": 21.954042434692383, "learning_rate": 1.7355102040816327e-05, "loss": 0.9266975522041321, "step": 748 }, { "epoch": 0.1498, "grad_norm": 10.645434379577637, "learning_rate": 1.7351020408163265e-05, "loss": 0.4366590082645416, "step": 749 }, { "epoch": 0.15, "grad_norm": 10.085464477539062, "learning_rate": 1.7346938775510206e-05, "loss": 0.39136600494384766, "step": 750 }, { "epoch": 0.1502, "grad_norm": 13.359373092651367, "learning_rate": 1.7342857142857143e-05, "loss": 0.6837975382804871, "step": 751 }, { "epoch": 0.1504, "grad_norm": 8.370230674743652, "learning_rate": 1.733877551020408e-05, "loss": 1.0935190916061401, "step": 752 }, { "epoch": 0.1506, "grad_norm": 14.436971664428711, "learning_rate": 1.7334693877551022e-05, "loss": 0.5633415579795837, "step": 753 }, { "epoch": 0.1508, "grad_norm": 10.484057426452637, "learning_rate": 1.733061224489796e-05, "loss": 1.3053311109542847, "step": 754 }, { "epoch": 0.151, "grad_norm": 13.887869834899902, "learning_rate": 1.7326530612244897e-05, "loss": 0.6421354413032532, "step": 755 }, { "epoch": 0.1512, "grad_norm": 7.413531303405762, "learning_rate": 1.7322448979591838e-05, "loss": 0.4527202546596527, "step": 756 }, { "epoch": 0.1514, "grad_norm": 40.82472229003906, "learning_rate": 1.7318367346938776e-05, "loss": 2.967447280883789, "step": 757 }, { "epoch": 0.1516, "grad_norm": 19.505496978759766, "learning_rate": 1.7314285714285717e-05, "loss": 2.8037269115448, "step": 758 }, { "epoch": 0.1518, "grad_norm": 11.161591529846191, "learning_rate": 1.7310204081632655e-05, "loss": 1.213711142539978, "step": 759 }, { "epoch": 0.152, "grad_norm": 10.131704330444336, "learning_rate": 1.7306122448979596e-05, "loss": 1.0836795568466187, "step": 760 }, { "epoch": 0.1522, "grad_norm": 21.120372772216797, "learning_rate": 1.7302040816326533e-05, "loss": 0.5193700194358826, "step": 761 }, { "epoch": 0.1524, "grad_norm": 7.548417091369629, "learning_rate": 1.729795918367347e-05, "loss": 0.33356955647468567, "step": 762 }, { "epoch": 0.1526, "grad_norm": 10.09682846069336, "learning_rate": 1.7293877551020412e-05, "loss": 0.945113480091095, "step": 763 }, { "epoch": 0.1528, "grad_norm": 8.723321914672852, "learning_rate": 1.728979591836735e-05, "loss": 1.0624339580535889, "step": 764 }, { "epoch": 0.153, "grad_norm": 11.091486930847168, "learning_rate": 1.7285714285714287e-05, "loss": 0.49018755555152893, "step": 765 }, { "epoch": 0.1532, "grad_norm": 7.9845170974731445, "learning_rate": 1.7281632653061228e-05, "loss": 0.07966050505638123, "step": 766 }, { "epoch": 0.1534, "grad_norm": 9.461292266845703, "learning_rate": 1.7277551020408166e-05, "loss": 1.2714213132858276, "step": 767 }, { "epoch": 0.1536, "grad_norm": 7.521483898162842, "learning_rate": 1.7273469387755104e-05, "loss": 0.8111341595649719, "step": 768 }, { "epoch": 0.1538, "grad_norm": 7.449307918548584, "learning_rate": 1.726938775510204e-05, "loss": 0.582644522190094, "step": 769 }, { "epoch": 0.154, "grad_norm": 7.701537609100342, "learning_rate": 1.7265306122448982e-05, "loss": 0.2264944314956665, "step": 770 }, { "epoch": 0.1542, "grad_norm": 9.337274551391602, "learning_rate": 1.726122448979592e-05, "loss": 0.8635380268096924, "step": 771 }, { "epoch": 0.1544, "grad_norm": 8.135647773742676, "learning_rate": 1.7257142857142857e-05, "loss": 0.22340886294841766, "step": 772 }, { "epoch": 0.1546, "grad_norm": 10.464387893676758, "learning_rate": 1.72530612244898e-05, "loss": 0.7764620780944824, "step": 773 }, { "epoch": 0.1548, "grad_norm": 11.095023155212402, "learning_rate": 1.7248979591836736e-05, "loss": 0.4418260157108307, "step": 774 }, { "epoch": 0.155, "grad_norm": 11.342694282531738, "learning_rate": 1.7244897959183674e-05, "loss": 1.4569212198257446, "step": 775 }, { "epoch": 0.1552, "grad_norm": 14.055512428283691, "learning_rate": 1.7240816326530615e-05, "loss": 1.60598886013031, "step": 776 }, { "epoch": 0.1554, "grad_norm": 17.556177139282227, "learning_rate": 1.7236734693877552e-05, "loss": 0.31339550018310547, "step": 777 }, { "epoch": 0.1556, "grad_norm": 10.759800910949707, "learning_rate": 1.723265306122449e-05, "loss": 0.34965983033180237, "step": 778 }, { "epoch": 0.1558, "grad_norm": 4.928140163421631, "learning_rate": 1.722857142857143e-05, "loss": 0.0968223437666893, "step": 779 }, { "epoch": 0.156, "grad_norm": 4.8700361251831055, "learning_rate": 1.722448979591837e-05, "loss": 0.07303012162446976, "step": 780 }, { "epoch": 0.1562, "grad_norm": 13.173325538635254, "learning_rate": 1.7220408163265306e-05, "loss": 0.7579927444458008, "step": 781 }, { "epoch": 0.1564, "grad_norm": 11.799232482910156, "learning_rate": 1.7216326530612247e-05, "loss": 1.2781928777694702, "step": 782 }, { "epoch": 0.1566, "grad_norm": 11.70905876159668, "learning_rate": 1.7212244897959185e-05, "loss": 0.45815756916999817, "step": 783 }, { "epoch": 0.1568, "grad_norm": 18.038320541381836, "learning_rate": 1.7208163265306123e-05, "loss": 0.3250984251499176, "step": 784 }, { "epoch": 0.157, "grad_norm": 17.403844833374023, "learning_rate": 1.7204081632653064e-05, "loss": 1.2099705934524536, "step": 785 }, { "epoch": 0.1572, "grad_norm": 15.40157413482666, "learning_rate": 1.72e-05, "loss": 1.0935746431350708, "step": 786 }, { "epoch": 0.1574, "grad_norm": 8.310811042785645, "learning_rate": 1.719591836734694e-05, "loss": 0.8277642130851746, "step": 787 }, { "epoch": 0.1576, "grad_norm": 8.334830284118652, "learning_rate": 1.719183673469388e-05, "loss": 0.8446757197380066, "step": 788 }, { "epoch": 0.1578, "grad_norm": 8.584236145019531, "learning_rate": 1.7187755102040818e-05, "loss": 0.38698020577430725, "step": 789 }, { "epoch": 0.158, "grad_norm": 9.955022811889648, "learning_rate": 1.7183673469387755e-05, "loss": 0.5164220929145813, "step": 790 }, { "epoch": 0.1582, "grad_norm": 9.958945274353027, "learning_rate": 1.7179591836734696e-05, "loss": 1.1173901557922363, "step": 791 }, { "epoch": 0.1584, "grad_norm": 8.765222549438477, "learning_rate": 1.7175510204081634e-05, "loss": 0.9976674914360046, "step": 792 }, { "epoch": 0.1586, "grad_norm": 23.056232452392578, "learning_rate": 1.717142857142857e-05, "loss": 1.370761752128601, "step": 793 }, { "epoch": 0.1588, "grad_norm": 23.940549850463867, "learning_rate": 1.7167346938775513e-05, "loss": 1.3595713376998901, "step": 794 }, { "epoch": 0.159, "grad_norm": 18.20630645751953, "learning_rate": 1.716326530612245e-05, "loss": 1.7061136960983276, "step": 795 }, { "epoch": 0.1592, "grad_norm": 16.01688003540039, "learning_rate": 1.7159183673469388e-05, "loss": 1.3681236505508423, "step": 796 }, { "epoch": 0.1594, "grad_norm": 8.32278823852539, "learning_rate": 1.7155102040816326e-05, "loss": 0.9235571026802063, "step": 797 }, { "epoch": 0.1596, "grad_norm": 10.294227600097656, "learning_rate": 1.7151020408163267e-05, "loss": 0.5937687754631042, "step": 798 }, { "epoch": 0.1598, "grad_norm": 11.361611366271973, "learning_rate": 1.7146938775510204e-05, "loss": 0.8640332221984863, "step": 799 }, { "epoch": 0.16, "grad_norm": 7.6554789543151855, "learning_rate": 1.7142857142857142e-05, "loss": 0.8463423848152161, "step": 800 }, { "epoch": 0.1602, "grad_norm": 8.576823234558105, "learning_rate": 1.7138775510204083e-05, "loss": 0.3656291961669922, "step": 801 }, { "epoch": 0.1604, "grad_norm": 6.198919773101807, "learning_rate": 1.713469387755102e-05, "loss": 0.22547852993011475, "step": 802 }, { "epoch": 0.1606, "grad_norm": 8.521926879882812, "learning_rate": 1.7130612244897958e-05, "loss": 0.3603907525539398, "step": 803 }, { "epoch": 0.1608, "grad_norm": 4.4991044998168945, "learning_rate": 1.71265306122449e-05, "loss": 0.04183853790163994, "step": 804 }, { "epoch": 0.161, "grad_norm": 7.233295440673828, "learning_rate": 1.712244897959184e-05, "loss": 0.22137486934661865, "step": 805 }, { "epoch": 0.1612, "grad_norm": 3.7667109966278076, "learning_rate": 1.7118367346938778e-05, "loss": 0.04676097258925438, "step": 806 }, { "epoch": 0.1614, "grad_norm": 8.665865898132324, "learning_rate": 1.7114285714285715e-05, "loss": 0.3963099420070648, "step": 807 }, { "epoch": 0.1616, "grad_norm": 5.754110336303711, "learning_rate": 1.7110204081632657e-05, "loss": 0.07381335645914078, "step": 808 }, { "epoch": 0.1618, "grad_norm": 15.004722595214844, "learning_rate": 1.7106122448979594e-05, "loss": 0.8166268467903137, "step": 809 }, { "epoch": 0.162, "grad_norm": 14.96613883972168, "learning_rate": 1.7102040816326532e-05, "loss": 0.2907205820083618, "step": 810 }, { "epoch": 0.1622, "grad_norm": 16.329971313476562, "learning_rate": 1.7097959183673473e-05, "loss": 0.6810183525085449, "step": 811 }, { "epoch": 0.1624, "grad_norm": 11.588258743286133, "learning_rate": 1.709387755102041e-05, "loss": 0.3183893859386444, "step": 812 }, { "epoch": 0.1626, "grad_norm": 11.18985652923584, "learning_rate": 1.7089795918367348e-05, "loss": 0.33114203810691833, "step": 813 }, { "epoch": 0.1628, "grad_norm": 11.678988456726074, "learning_rate": 1.708571428571429e-05, "loss": 0.7535752654075623, "step": 814 }, { "epoch": 0.163, "grad_norm": 13.68851375579834, "learning_rate": 1.7081632653061227e-05, "loss": 0.889157772064209, "step": 815 }, { "epoch": 0.1632, "grad_norm": 14.774972915649414, "learning_rate": 1.7077551020408164e-05, "loss": 1.3850507736206055, "step": 816 }, { "epoch": 0.1634, "grad_norm": 11.22496223449707, "learning_rate": 1.7073469387755105e-05, "loss": 0.2509547472000122, "step": 817 }, { "epoch": 0.1636, "grad_norm": 10.1691312789917, "learning_rate": 1.7069387755102043e-05, "loss": 0.150825634598732, "step": 818 }, { "epoch": 0.1638, "grad_norm": 63.16236114501953, "learning_rate": 1.706530612244898e-05, "loss": 2.487028121948242, "step": 819 }, { "epoch": 0.164, "grad_norm": 29.83321189880371, "learning_rate": 1.7061224489795922e-05, "loss": 1.8375195264816284, "step": 820 }, { "epoch": 0.1642, "grad_norm": 13.816812515258789, "learning_rate": 1.705714285714286e-05, "loss": 0.6691530346870422, "step": 821 }, { "epoch": 0.1644, "grad_norm": 4.308398723602295, "learning_rate": 1.7053061224489797e-05, "loss": 0.05395898222923279, "step": 822 }, { "epoch": 0.1646, "grad_norm": 12.161376953125, "learning_rate": 1.7048979591836735e-05, "loss": 0.49681249260902405, "step": 823 }, { "epoch": 0.1648, "grad_norm": 7.281305313110352, "learning_rate": 1.7044897959183676e-05, "loss": 0.2690201997756958, "step": 824 }, { "epoch": 0.165, "grad_norm": 15.228219032287598, "learning_rate": 1.7040816326530613e-05, "loss": 2.183903932571411, "step": 825 }, { "epoch": 0.1652, "grad_norm": 11.266607284545898, "learning_rate": 1.703673469387755e-05, "loss": 1.8891977071762085, "step": 826 }, { "epoch": 0.1654, "grad_norm": 9.80399227142334, "learning_rate": 1.7032653061224492e-05, "loss": 1.9456223249435425, "step": 827 }, { "epoch": 0.1656, "grad_norm": 6.174814224243164, "learning_rate": 1.702857142857143e-05, "loss": 1.6220054626464844, "step": 828 }, { "epoch": 0.1658, "grad_norm": 9.70406723022461, "learning_rate": 1.7024489795918367e-05, "loss": 1.2355018854141235, "step": 829 }, { "epoch": 0.166, "grad_norm": 8.401901245117188, "learning_rate": 1.7020408163265308e-05, "loss": 0.797677755355835, "step": 830 }, { "epoch": 0.1662, "grad_norm": 10.09656810760498, "learning_rate": 1.7016326530612246e-05, "loss": 0.30209818482398987, "step": 831 }, { "epoch": 0.1664, "grad_norm": 7.124859809875488, "learning_rate": 1.7012244897959184e-05, "loss": 0.24133729934692383, "step": 832 }, { "epoch": 0.1666, "grad_norm": 8.425739288330078, "learning_rate": 1.7008163265306125e-05, "loss": 0.19166846573352814, "step": 833 }, { "epoch": 0.1668, "grad_norm": 5.5932769775390625, "learning_rate": 1.7004081632653062e-05, "loss": 0.12489122152328491, "step": 834 }, { "epoch": 0.167, "grad_norm": 7.666117191314697, "learning_rate": 1.7e-05, "loss": 0.3184269368648529, "step": 835 }, { "epoch": 0.1672, "grad_norm": 7.162036895751953, "learning_rate": 1.699591836734694e-05, "loss": 0.14772318303585052, "step": 836 }, { "epoch": 0.1674, "grad_norm": 23.604835510253906, "learning_rate": 1.699183673469388e-05, "loss": 1.2217166423797607, "step": 837 }, { "epoch": 0.1676, "grad_norm": 19.6046142578125, "learning_rate": 1.6987755102040816e-05, "loss": 1.1713870763778687, "step": 838 }, { "epoch": 0.1678, "grad_norm": 10.156699180603027, "learning_rate": 1.6983673469387757e-05, "loss": 0.7312700152397156, "step": 839 }, { "epoch": 0.168, "grad_norm": 6.102944850921631, "learning_rate": 1.6979591836734695e-05, "loss": 0.21069572865962982, "step": 840 }, { "epoch": 0.1682, "grad_norm": 7.401151657104492, "learning_rate": 1.6975510204081632e-05, "loss": 0.18337900936603546, "step": 841 }, { "epoch": 0.1684, "grad_norm": 5.875607490539551, "learning_rate": 1.6971428571428574e-05, "loss": 0.06811075657606125, "step": 842 }, { "epoch": 0.1686, "grad_norm": 15.57354736328125, "learning_rate": 1.696734693877551e-05, "loss": 2.1977641582489014, "step": 843 }, { "epoch": 0.1688, "grad_norm": 12.335282325744629, "learning_rate": 1.696326530612245e-05, "loss": 2.071246862411499, "step": 844 }, { "epoch": 0.169, "grad_norm": 9.727235794067383, "learning_rate": 1.695918367346939e-05, "loss": 1.200844168663025, "step": 845 }, { "epoch": 0.1692, "grad_norm": 11.321294784545898, "learning_rate": 1.6955102040816327e-05, "loss": 1.156197428703308, "step": 846 }, { "epoch": 0.1694, "grad_norm": 27.516820907592773, "learning_rate": 1.6951020408163265e-05, "loss": 1.9044551849365234, "step": 847 }, { "epoch": 0.1696, "grad_norm": 26.124889373779297, "learning_rate": 1.6946938775510203e-05, "loss": 1.5935903787612915, "step": 848 }, { "epoch": 0.1698, "grad_norm": 10.166653633117676, "learning_rate": 1.6942857142857144e-05, "loss": 1.2305039167404175, "step": 849 }, { "epoch": 0.17, "grad_norm": 10.783432006835938, "learning_rate": 1.6938775510204085e-05, "loss": 0.37452611327171326, "step": 850 }, { "epoch": 0.1702, "grad_norm": 8.736797332763672, "learning_rate": 1.6934693877551022e-05, "loss": 0.39462193846702576, "step": 851 }, { "epoch": 0.1704, "grad_norm": 4.686192989349365, "learning_rate": 1.693061224489796e-05, "loss": 0.05741189420223236, "step": 852 }, { "epoch": 0.1706, "grad_norm": 8.015236854553223, "learning_rate": 1.69265306122449e-05, "loss": 0.346451997756958, "step": 853 }, { "epoch": 0.1708, "grad_norm": 4.675577163696289, "learning_rate": 1.692244897959184e-05, "loss": 0.04520675167441368, "step": 854 }, { "epoch": 0.171, "grad_norm": 12.066052436828613, "learning_rate": 1.6918367346938776e-05, "loss": 0.5323671698570251, "step": 855 }, { "epoch": 0.1712, "grad_norm": 4.285087585449219, "learning_rate": 1.6914285714285717e-05, "loss": 0.05052861571311951, "step": 856 }, { "epoch": 0.1714, "grad_norm": 8.832772254943848, "learning_rate": 1.6910204081632655e-05, "loss": 2.343698501586914, "step": 857 }, { "epoch": 0.1716, "grad_norm": 7.151556491851807, "learning_rate": 1.6906122448979593e-05, "loss": 2.344592332839966, "step": 858 }, { "epoch": 0.1718, "grad_norm": 9.483006477355957, "learning_rate": 1.6902040816326534e-05, "loss": 1.2930094003677368, "step": 859 }, { "epoch": 0.172, "grad_norm": 10.535362243652344, "learning_rate": 1.689795918367347e-05, "loss": 0.9568271636962891, "step": 860 }, { "epoch": 0.1722, "grad_norm": 12.547567367553711, "learning_rate": 1.689387755102041e-05, "loss": 0.5192998051643372, "step": 861 }, { "epoch": 0.1724, "grad_norm": 8.66522216796875, "learning_rate": 1.688979591836735e-05, "loss": 0.18473176658153534, "step": 862 }, { "epoch": 0.1726, "grad_norm": 11.593875885009766, "learning_rate": 1.6885714285714288e-05, "loss": 1.230906367301941, "step": 863 }, { "epoch": 0.1728, "grad_norm": 12.751099586486816, "learning_rate": 1.6881632653061225e-05, "loss": 1.3743547201156616, "step": 864 }, { "epoch": 0.173, "grad_norm": 11.993965148925781, "learning_rate": 1.6877551020408166e-05, "loss": 1.8797513246536255, "step": 865 }, { "epoch": 0.1732, "grad_norm": 10.8436279296875, "learning_rate": 1.6873469387755104e-05, "loss": 1.398915410041809, "step": 866 }, { "epoch": 0.1734, "grad_norm": 12.852522850036621, "learning_rate": 1.686938775510204e-05, "loss": 1.0060895681381226, "step": 867 }, { "epoch": 0.1736, "grad_norm": 12.911187171936035, "learning_rate": 1.6865306122448983e-05, "loss": 0.9872347712516785, "step": 868 }, { "epoch": 0.1738, "grad_norm": 10.305516242980957, "learning_rate": 1.686122448979592e-05, "loss": 0.8250768184661865, "step": 869 }, { "epoch": 0.174, "grad_norm": 8.301410675048828, "learning_rate": 1.6857142857142858e-05, "loss": 0.1378517895936966, "step": 870 }, { "epoch": 0.1742, "grad_norm": 10.204079627990723, "learning_rate": 1.68530612244898e-05, "loss": 0.8009190559387207, "step": 871 }, { "epoch": 0.1744, "grad_norm": 11.249664306640625, "learning_rate": 1.6848979591836737e-05, "loss": 0.389691025018692, "step": 872 }, { "epoch": 0.1746, "grad_norm": 10.197457313537598, "learning_rate": 1.6844897959183674e-05, "loss": 0.3324334919452667, "step": 873 }, { "epoch": 0.1748, "grad_norm": 6.4466400146484375, "learning_rate": 1.6840816326530612e-05, "loss": 0.16674156486988068, "step": 874 }, { "epoch": 0.175, "grad_norm": 8.318696022033691, "learning_rate": 1.6836734693877553e-05, "loss": 0.4006370007991791, "step": 875 }, { "epoch": 0.1752, "grad_norm": 4.881451606750488, "learning_rate": 1.683265306122449e-05, "loss": 0.163467139005661, "step": 876 }, { "epoch": 0.1754, "grad_norm": 9.221220970153809, "learning_rate": 1.6828571428571428e-05, "loss": 0.5745043754577637, "step": 877 }, { "epoch": 0.1756, "grad_norm": 9.977005958557129, "learning_rate": 1.682448979591837e-05, "loss": 0.2814178466796875, "step": 878 }, { "epoch": 0.1758, "grad_norm": 10.049405097961426, "learning_rate": 1.6820408163265307e-05, "loss": 0.39915338158607483, "step": 879 }, { "epoch": 0.176, "grad_norm": 8.953500747680664, "learning_rate": 1.6816326530612244e-05, "loss": 0.6644017696380615, "step": 880 }, { "epoch": 0.1762, "grad_norm": 35.30412292480469, "learning_rate": 1.6812244897959185e-05, "loss": 2.2476003170013428, "step": 881 }, { "epoch": 0.1764, "grad_norm": 23.374265670776367, "learning_rate": 1.6808163265306123e-05, "loss": 1.5329824686050415, "step": 882 }, { "epoch": 0.1766, "grad_norm": 20.165687561035156, "learning_rate": 1.680408163265306e-05, "loss": 0.6441766023635864, "step": 883 }, { "epoch": 0.1768, "grad_norm": 17.912281036376953, "learning_rate": 1.6800000000000002e-05, "loss": 0.4463423490524292, "step": 884 }, { "epoch": 0.177, "grad_norm": 20.251941680908203, "learning_rate": 1.679591836734694e-05, "loss": 0.8454023003578186, "step": 885 }, { "epoch": 0.1772, "grad_norm": 22.15752410888672, "learning_rate": 1.6791836734693877e-05, "loss": 0.892297089099884, "step": 886 }, { "epoch": 0.1774, "grad_norm": 10.843847274780273, "learning_rate": 1.6787755102040818e-05, "loss": 0.5362411141395569, "step": 887 }, { "epoch": 0.1776, "grad_norm": 4.842869758605957, "learning_rate": 1.6783673469387756e-05, "loss": 0.15559206902980804, "step": 888 }, { "epoch": 0.1778, "grad_norm": 8.601651191711426, "learning_rate": 1.6779591836734693e-05, "loss": 0.2904387414455414, "step": 889 }, { "epoch": 0.178, "grad_norm": 3.9437415599823, "learning_rate": 1.6775510204081634e-05, "loss": 0.04863588139414787, "step": 890 }, { "epoch": 0.1782, "grad_norm": 7.6431708335876465, "learning_rate": 1.6771428571428572e-05, "loss": 0.18728196620941162, "step": 891 }, { "epoch": 0.1784, "grad_norm": 10.038166046142578, "learning_rate": 1.676734693877551e-05, "loss": 0.2966366410255432, "step": 892 }, { "epoch": 0.1786, "grad_norm": 11.422334671020508, "learning_rate": 1.676326530612245e-05, "loss": 0.5634375214576721, "step": 893 }, { "epoch": 0.1788, "grad_norm": 11.061003684997559, "learning_rate": 1.6759183673469392e-05, "loss": 0.6100184321403503, "step": 894 }, { "epoch": 0.179, "grad_norm": 10.992979049682617, "learning_rate": 1.675510204081633e-05, "loss": 1.1781855821609497, "step": 895 }, { "epoch": 0.1792, "grad_norm": 13.457104682922363, "learning_rate": 1.6751020408163267e-05, "loss": 1.1264137029647827, "step": 896 }, { "epoch": 0.1794, "grad_norm": 7.429170608520508, "learning_rate": 1.6746938775510208e-05, "loss": 0.5296066403388977, "step": 897 }, { "epoch": 0.1796, "grad_norm": 12.779236793518066, "learning_rate": 1.6742857142857146e-05, "loss": 0.47797414660453796, "step": 898 }, { "epoch": 0.1798, "grad_norm": 12.133633613586426, "learning_rate": 1.6738775510204083e-05, "loss": 1.237507939338684, "step": 899 }, { "epoch": 0.18, "grad_norm": 14.93108081817627, "learning_rate": 1.673469387755102e-05, "loss": 1.5555381774902344, "step": 900 }, { "epoch": 0.1802, "grad_norm": 7.929037570953369, "learning_rate": 1.6730612244897962e-05, "loss": 0.27353933453559875, "step": 901 }, { "epoch": 0.1804, "grad_norm": 9.74111557006836, "learning_rate": 1.67265306122449e-05, "loss": 0.3268223702907562, "step": 902 }, { "epoch": 0.1806, "grad_norm": 12.810250282287598, "learning_rate": 1.6722448979591837e-05, "loss": 1.1125568151474, "step": 903 }, { "epoch": 0.1808, "grad_norm": 10.80146312713623, "learning_rate": 1.6718367346938778e-05, "loss": 0.8165103793144226, "step": 904 }, { "epoch": 0.181, "grad_norm": 12.98203182220459, "learning_rate": 1.6714285714285716e-05, "loss": 0.8082551956176758, "step": 905 }, { "epoch": 0.1812, "grad_norm": 15.881731033325195, "learning_rate": 1.6710204081632654e-05, "loss": 0.8954185843467712, "step": 906 }, { "epoch": 0.1814, "grad_norm": 7.476495742797852, "learning_rate": 1.6706122448979595e-05, "loss": 2.61548113822937, "step": 907 }, { "epoch": 0.1816, "grad_norm": 11.759521484375, "learning_rate": 1.6702040816326532e-05, "loss": 2.5834901332855225, "step": 908 }, { "epoch": 0.1818, "grad_norm": 17.707563400268555, "learning_rate": 1.669795918367347e-05, "loss": 0.46600961685180664, "step": 909 }, { "epoch": 0.182, "grad_norm": 17.35898208618164, "learning_rate": 1.669387755102041e-05, "loss": 0.5477342009544373, "step": 910 }, { "epoch": 0.1822, "grad_norm": 16.38886260986328, "learning_rate": 1.668979591836735e-05, "loss": 0.241845965385437, "step": 911 }, { "epoch": 0.1824, "grad_norm": 9.18626594543457, "learning_rate": 1.6685714285714286e-05, "loss": 0.3251483738422394, "step": 912 }, { "epoch": 0.1826, "grad_norm": 11.48183536529541, "learning_rate": 1.6681632653061227e-05, "loss": 1.201566457748413, "step": 913 }, { "epoch": 0.1828, "grad_norm": 12.863166809082031, "learning_rate": 1.6677551020408165e-05, "loss": 1.3929704427719116, "step": 914 }, { "epoch": 0.183, "grad_norm": 8.982046127319336, "learning_rate": 1.6673469387755102e-05, "loss": 0.30986925959587097, "step": 915 }, { "epoch": 0.1832, "grad_norm": 11.40433120727539, "learning_rate": 1.6669387755102044e-05, "loss": 0.4559279978275299, "step": 916 }, { "epoch": 0.1834, "grad_norm": 7.865514755249023, "learning_rate": 1.666530612244898e-05, "loss": 0.21401290595531464, "step": 917 }, { "epoch": 0.1836, "grad_norm": 5.85783052444458, "learning_rate": 1.666122448979592e-05, "loss": 0.058106567710638046, "step": 918 }, { "epoch": 0.1838, "grad_norm": 12.835536003112793, "learning_rate": 1.665714285714286e-05, "loss": 0.5007133483886719, "step": 919 }, { "epoch": 0.184, "grad_norm": 10.269946098327637, "learning_rate": 1.6653061224489797e-05, "loss": 1.027040958404541, "step": 920 }, { "epoch": 0.1842, "grad_norm": 28.42228889465332, "learning_rate": 1.6648979591836735e-05, "loss": 1.9055904150009155, "step": 921 }, { "epoch": 0.1844, "grad_norm": 30.98174476623535, "learning_rate": 1.6644897959183676e-05, "loss": 1.6907411813735962, "step": 922 }, { "epoch": 0.1846, "grad_norm": 11.129870414733887, "learning_rate": 1.6640816326530614e-05, "loss": 0.4462185800075531, "step": 923 }, { "epoch": 0.1848, "grad_norm": 9.945165634155273, "learning_rate": 1.663673469387755e-05, "loss": 0.38100337982177734, "step": 924 }, { "epoch": 0.185, "grad_norm": 9.017776489257812, "learning_rate": 1.6632653061224492e-05, "loss": 0.3078192174434662, "step": 925 }, { "epoch": 0.1852, "grad_norm": 7.9817118644714355, "learning_rate": 1.662857142857143e-05, "loss": 0.24498671293258667, "step": 926 }, { "epoch": 0.1854, "grad_norm": 10.647873878479004, "learning_rate": 1.6624489795918368e-05, "loss": 0.6621216535568237, "step": 927 }, { "epoch": 0.1856, "grad_norm": 8.087475776672363, "learning_rate": 1.6620408163265305e-05, "loss": 0.9449856877326965, "step": 928 }, { "epoch": 0.1858, "grad_norm": 9.353774070739746, "learning_rate": 1.6616326530612246e-05, "loss": 0.6571586728096008, "step": 929 }, { "epoch": 0.186, "grad_norm": 5.978039264678955, "learning_rate": 1.6612244897959184e-05, "loss": 0.3160820007324219, "step": 930 }, { "epoch": 0.1862, "grad_norm": 9.821951866149902, "learning_rate": 1.660816326530612e-05, "loss": 1.3220912218093872, "step": 931 }, { "epoch": 0.1864, "grad_norm": 16.91419792175293, "learning_rate": 1.6604081632653063e-05, "loss": 1.5503568649291992, "step": 932 }, { "epoch": 0.1866, "grad_norm": 10.032426834106445, "learning_rate": 1.66e-05, "loss": 0.3008374869823456, "step": 933 }, { "epoch": 0.1868, "grad_norm": 9.833456993103027, "learning_rate": 1.6595918367346938e-05, "loss": 0.3403104841709137, "step": 934 }, { "epoch": 0.187, "grad_norm": 10.148767471313477, "learning_rate": 1.659183673469388e-05, "loss": 0.35658565163612366, "step": 935 }, { "epoch": 0.1872, "grad_norm": 9.864377975463867, "learning_rate": 1.6587755102040817e-05, "loss": 0.3419613540172577, "step": 936 }, { "epoch": 0.1874, "grad_norm": 10.93486213684082, "learning_rate": 1.6583673469387754e-05, "loss": 0.3023720681667328, "step": 937 }, { "epoch": 0.1876, "grad_norm": 11.544412612915039, "learning_rate": 1.6579591836734695e-05, "loss": 0.3000563085079193, "step": 938 }, { "epoch": 0.1878, "grad_norm": 26.628463745117188, "learning_rate": 1.6575510204081633e-05, "loss": 1.5193748474121094, "step": 939 }, { "epoch": 0.188, "grad_norm": 34.582759857177734, "learning_rate": 1.6571428571428574e-05, "loss": 1.9608640670776367, "step": 940 }, { "epoch": 0.1882, "grad_norm": 12.25756549835205, "learning_rate": 1.656734693877551e-05, "loss": 0.35635626316070557, "step": 941 }, { "epoch": 0.1884, "grad_norm": 9.758035659790039, "learning_rate": 1.6563265306122453e-05, "loss": 0.5577664375305176, "step": 942 }, { "epoch": 0.1886, "grad_norm": 11.022090911865234, "learning_rate": 1.655918367346939e-05, "loss": 0.7006524205207825, "step": 943 }, { "epoch": 0.1888, "grad_norm": 5.026852607727051, "learning_rate": 1.6555102040816328e-05, "loss": 0.07087205350399017, "step": 944 }, { "epoch": 0.189, "grad_norm": 17.114988327026367, "learning_rate": 1.655102040816327e-05, "loss": 0.38376402854919434, "step": 945 }, { "epoch": 0.1892, "grad_norm": 7.769924640655518, "learning_rate": 1.6546938775510207e-05, "loss": 0.42324718832969666, "step": 946 }, { "epoch": 0.1894, "grad_norm": 6.787831783294678, "learning_rate": 1.6542857142857144e-05, "loss": 0.15425239503383636, "step": 947 }, { "epoch": 0.1896, "grad_norm": 9.386636734008789, "learning_rate": 1.6538775510204085e-05, "loss": 0.12651686370372772, "step": 948 }, { "epoch": 0.1898, "grad_norm": 9.607634544372559, "learning_rate": 1.6534693877551023e-05, "loss": 1.2049074172973633, "step": 949 }, { "epoch": 0.19, "grad_norm": 6.797369480133057, "learning_rate": 1.653061224489796e-05, "loss": 0.4870491027832031, "step": 950 }, { "epoch": 0.1902, "grad_norm": 11.430431365966797, "learning_rate": 1.65265306122449e-05, "loss": 0.5907344818115234, "step": 951 }, { "epoch": 0.1904, "grad_norm": 5.133027076721191, "learning_rate": 1.652244897959184e-05, "loss": 0.05749613419175148, "step": 952 }, { "epoch": 0.1906, "grad_norm": 12.587271690368652, "learning_rate": 1.6518367346938777e-05, "loss": 0.5050173997879028, "step": 953 }, { "epoch": 0.1908, "grad_norm": 8.202754974365234, "learning_rate": 1.6514285714285714e-05, "loss": 0.12933553755283356, "step": 954 }, { "epoch": 0.191, "grad_norm": 16.069805145263672, "learning_rate": 1.6510204081632655e-05, "loss": 0.9935396313667297, "step": 955 }, { "epoch": 0.1912, "grad_norm": 12.470759391784668, "learning_rate": 1.6506122448979593e-05, "loss": 0.9283061623573303, "step": 956 }, { "epoch": 0.1914, "grad_norm": 11.212996482849121, "learning_rate": 1.650204081632653e-05, "loss": 0.42528319358825684, "step": 957 }, { "epoch": 0.1916, "grad_norm": 11.121280670166016, "learning_rate": 1.6497959183673472e-05, "loss": 0.3700462281703949, "step": 958 }, { "epoch": 0.1918, "grad_norm": 7.70326042175293, "learning_rate": 1.649387755102041e-05, "loss": 0.29956695437431335, "step": 959 }, { "epoch": 0.192, "grad_norm": 7.19465970993042, "learning_rate": 1.6489795918367347e-05, "loss": 0.12585383653640747, "step": 960 }, { "epoch": 0.1922, "grad_norm": 9.511792182922363, "learning_rate": 1.6485714285714288e-05, "loss": 0.7968559265136719, "step": 961 }, { "epoch": 0.1924, "grad_norm": 9.611586570739746, "learning_rate": 1.6481632653061226e-05, "loss": 0.5876891016960144, "step": 962 }, { "epoch": 0.1926, "grad_norm": 13.071715354919434, "learning_rate": 1.6477551020408163e-05, "loss": 2.1680548191070557, "step": 963 }, { "epoch": 0.1928, "grad_norm": 13.33552074432373, "learning_rate": 1.6473469387755104e-05, "loss": 2.052441358566284, "step": 964 }, { "epoch": 0.193, "grad_norm": 6.807119369506836, "learning_rate": 1.6469387755102042e-05, "loss": 0.11891964823007584, "step": 965 }, { "epoch": 0.1932, "grad_norm": 7.7496867179870605, "learning_rate": 1.646530612244898e-05, "loss": 0.09799344092607498, "step": 966 }, { "epoch": 0.1934, "grad_norm": 7.558412551879883, "learning_rate": 1.646122448979592e-05, "loss": 3.6984379291534424, "step": 967 }, { "epoch": 0.1936, "grad_norm": 9.672758102416992, "learning_rate": 1.645714285714286e-05, "loss": 3.672480344772339, "step": 968 }, { "epoch": 0.1938, "grad_norm": 11.874825477600098, "learning_rate": 1.6453061224489796e-05, "loss": 1.5774186849594116, "step": 969 }, { "epoch": 0.194, "grad_norm": 9.94528865814209, "learning_rate": 1.6448979591836737e-05, "loss": 1.4908758401870728, "step": 970 }, { "epoch": 0.1942, "grad_norm": 8.902161598205566, "learning_rate": 1.6444897959183675e-05, "loss": 1.4055157899856567, "step": 971 }, { "epoch": 0.1944, "grad_norm": 8.480953216552734, "learning_rate": 1.6440816326530612e-05, "loss": 1.4322876930236816, "step": 972 }, { "epoch": 0.1946, "grad_norm": 8.42033576965332, "learning_rate": 1.6436734693877553e-05, "loss": 0.5924834609031677, "step": 973 }, { "epoch": 0.1948, "grad_norm": 9.280115127563477, "learning_rate": 1.643265306122449e-05, "loss": 1.2530088424682617, "step": 974 }, { "epoch": 0.195, "grad_norm": 11.592122077941895, "learning_rate": 1.642857142857143e-05, "loss": 2.4300496578216553, "step": 975 }, { "epoch": 0.1952, "grad_norm": 12.084485054016113, "learning_rate": 1.642448979591837e-05, "loss": 2.4004557132720947, "step": 976 }, { "epoch": 0.1954, "grad_norm": 15.528419494628906, "learning_rate": 1.6420408163265307e-05, "loss": 0.7741004824638367, "step": 977 }, { "epoch": 0.1956, "grad_norm": 21.244705200195312, "learning_rate": 1.6416326530612245e-05, "loss": 0.8380337357521057, "step": 978 }, { "epoch": 0.1958, "grad_norm": 45.149131774902344, "learning_rate": 1.6412244897959183e-05, "loss": 2.7999441623687744, "step": 979 }, { "epoch": 0.196, "grad_norm": 12.799947738647461, "learning_rate": 1.6408163265306124e-05, "loss": 2.601938486099243, "step": 980 }, { "epoch": 0.1962, "grad_norm": 12.487517356872559, "learning_rate": 1.640408163265306e-05, "loss": 0.842620313167572, "step": 981 }, { "epoch": 0.1964, "grad_norm": 11.241732597351074, "learning_rate": 1.64e-05, "loss": 1.266300082206726, "step": 982 }, { "epoch": 0.1966, "grad_norm": 13.937366485595703, "learning_rate": 1.639591836734694e-05, "loss": 0.48552432656288147, "step": 983 }, { "epoch": 0.1968, "grad_norm": 12.574949264526367, "learning_rate": 1.6391836734693878e-05, "loss": 0.39523372054100037, "step": 984 }, { "epoch": 0.197, "grad_norm": 12.430194854736328, "learning_rate": 1.638775510204082e-05, "loss": 0.41487932205200195, "step": 985 }, { "epoch": 0.1972, "grad_norm": 12.363018035888672, "learning_rate": 1.6383673469387756e-05, "loss": 0.5536958575248718, "step": 986 }, { "epoch": 0.1974, "grad_norm": 12.69167709350586, "learning_rate": 1.6379591836734697e-05, "loss": 0.532865583896637, "step": 987 }, { "epoch": 0.1976, "grad_norm": 12.416764259338379, "learning_rate": 1.6375510204081635e-05, "loss": 0.49868759512901306, "step": 988 }, { "epoch": 0.1978, "grad_norm": 21.368391036987305, "learning_rate": 1.6371428571428572e-05, "loss": 0.9303390979766846, "step": 989 }, { "epoch": 0.198, "grad_norm": 17.462474822998047, "learning_rate": 1.6367346938775513e-05, "loss": 0.6140914559364319, "step": 990 }, { "epoch": 0.1982, "grad_norm": 13.780851364135742, "learning_rate": 1.636326530612245e-05, "loss": 0.538284420967102, "step": 991 }, { "epoch": 0.1984, "grad_norm": 11.856263160705566, "learning_rate": 1.635918367346939e-05, "loss": 0.3810892403125763, "step": 992 }, { "epoch": 0.1986, "grad_norm": 13.552824974060059, "learning_rate": 1.635510204081633e-05, "loss": 1.638392448425293, "step": 993 }, { "epoch": 0.1988, "grad_norm": 13.62729263305664, "learning_rate": 1.6351020408163267e-05, "loss": 1.083305835723877, "step": 994 }, { "epoch": 0.199, "grad_norm": 13.757994651794434, "learning_rate": 1.6346938775510205e-05, "loss": 0.7081871032714844, "step": 995 }, { "epoch": 0.1992, "grad_norm": 11.198212623596191, "learning_rate": 1.6342857142857146e-05, "loss": 0.4434303343296051, "step": 996 }, { "epoch": 0.1994, "grad_norm": 9.911272048950195, "learning_rate": 1.6338775510204084e-05, "loss": 0.4623580276966095, "step": 997 }, { "epoch": 0.1996, "grad_norm": 6.290027618408203, "learning_rate": 1.633469387755102e-05, "loss": 0.2070257067680359, "step": 998 }, { "epoch": 0.1998, "grad_norm": 8.490586280822754, "learning_rate": 1.6330612244897962e-05, "loss": 0.41782307624816895, "step": 999 }, { "epoch": 0.2, "grad_norm": 6.666543006896973, "learning_rate": 1.63265306122449e-05, "loss": 0.0420907698571682, "step": 1000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 192, "trial_name": null, "trial_params": null }