{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 19.86993980407715, "learning_rate": 2.0000000000000002e-07, "loss": 0.7037215232849121, "step": 1 }, { "epoch": 0.0004, "grad_norm": 17.658987045288086, "learning_rate": 4.0000000000000003e-07, "loss": 0.4115454852581024, "step": 2 }, { "epoch": 0.0006, "grad_norm": 27.911306381225586, "learning_rate": 6.000000000000001e-07, "loss": 0.7317515015602112, "step": 3 }, { "epoch": 0.0008, "grad_norm": 23.786823272705078, "learning_rate": 8.000000000000001e-07, "loss": 0.4670589864253998, "step": 4 }, { "epoch": 0.001, "grad_norm": 736.2578125, "learning_rate": 1.0000000000000002e-06, "loss": 8.38817024230957, "step": 5 }, { "epoch": 0.0012, "grad_norm": 483.0341491699219, "learning_rate": 1.2000000000000002e-06, "loss": 7.923478603363037, "step": 6 }, { "epoch": 0.0014, "grad_norm": 26.421083450317383, "learning_rate": 1.4000000000000001e-06, "loss": 0.5545251369476318, "step": 7 }, { "epoch": 0.0016, "grad_norm": 25.409162521362305, "learning_rate": 1.6000000000000001e-06, "loss": 0.5838867425918579, "step": 8 }, { "epoch": 0.0018, "grad_norm": 23.164382934570312, "learning_rate": 1.8000000000000001e-06, "loss": 0.9057097434997559, "step": 9 }, { "epoch": 0.002, "grad_norm": 18.42571258544922, "learning_rate": 2.0000000000000003e-06, "loss": 0.3040672242641449, "step": 10 }, { "epoch": 0.0022, "grad_norm": 15.404990196228027, "learning_rate": 2.2e-06, "loss": 0.8651700615882874, "step": 11 }, { "epoch": 0.0024, "grad_norm": 11.985212326049805, "learning_rate": 2.4000000000000003e-06, "loss": 0.7416182160377502, "step": 12 }, { "epoch": 0.0026, "grad_norm": 29.11711311340332, "learning_rate": 2.6e-06, "loss": 1.2297133207321167, "step": 13 }, { "epoch": 0.0028, "grad_norm": 27.741609573364258, "learning_rate": 2.8000000000000003e-06, "loss": 1.1059151887893677, "step": 14 }, { "epoch": 0.003, "grad_norm": 26.51416015625, "learning_rate": 3e-06, "loss": 0.6906358599662781, "step": 15 }, { "epoch": 0.0032, "grad_norm": 29.349390029907227, "learning_rate": 3.2000000000000003e-06, "loss": 0.6707777380943298, "step": 16 }, { "epoch": 0.0034, "grad_norm": 262.01763916015625, "learning_rate": 3.4000000000000005e-06, "loss": 5.405397891998291, "step": 17 }, { "epoch": 0.0036, "grad_norm": 203.5963134765625, "learning_rate": 3.6000000000000003e-06, "loss": 5.416719913482666, "step": 18 }, { "epoch": 0.0038, "grad_norm": 17.128952026367188, "learning_rate": 3.8000000000000005e-06, "loss": 0.6377151608467102, "step": 19 }, { "epoch": 0.004, "grad_norm": 18.25572967529297, "learning_rate": 4.000000000000001e-06, "loss": 0.6638563871383667, "step": 20 }, { "epoch": 0.0042, "grad_norm": 224.96484375, "learning_rate": 4.2000000000000004e-06, "loss": 2.4988303184509277, "step": 21 }, { "epoch": 0.0044, "grad_norm": 186.37709045410156, "learning_rate": 4.4e-06, "loss": 2.37096905708313, "step": 22 }, { "epoch": 0.0046, "grad_norm": 301.34625244140625, "learning_rate": 4.600000000000001e-06, "loss": 6.065738201141357, "step": 23 }, { "epoch": 0.0048, "grad_norm": 124.02801513671875, "learning_rate": 4.800000000000001e-06, "loss": 4.959842681884766, "step": 24 }, { "epoch": 0.005, "grad_norm": 191.34832763671875, "learning_rate": 5e-06, "loss": 3.2318477630615234, "step": 25 }, { "epoch": 0.0052, "grad_norm": 102.69784545898438, "learning_rate": 5.2e-06, "loss": 2.0269229412078857, "step": 26 }, { "epoch": 0.0054, "grad_norm": 185.41197204589844, "learning_rate": 5.400000000000001e-06, "loss": 3.1023576259613037, "step": 27 }, { "epoch": 0.0056, "grad_norm": 115.0453872680664, "learning_rate": 5.600000000000001e-06, "loss": 2.033405065536499, "step": 28 }, { "epoch": 0.0058, "grad_norm": 19.503202438354492, "learning_rate": 5.8e-06, "loss": 0.6852450966835022, "step": 29 }, { "epoch": 0.006, "grad_norm": 16.313444137573242, "learning_rate": 6e-06, "loss": 0.6160387992858887, "step": 30 }, { "epoch": 0.0062, "grad_norm": 34.96455764770508, "learning_rate": 6.200000000000001e-06, "loss": 0.8548898696899414, "step": 31 }, { "epoch": 0.0064, "grad_norm": 28.085153579711914, "learning_rate": 6.4000000000000006e-06, "loss": 0.43956005573272705, "step": 32 }, { "epoch": 0.0066, "grad_norm": 15.484055519104004, "learning_rate": 6.600000000000001e-06, "loss": 0.6819882392883301, "step": 33 }, { "epoch": 0.0068, "grad_norm": 11.606240272521973, "learning_rate": 6.800000000000001e-06, "loss": 0.2640717327594757, "step": 34 }, { "epoch": 0.007, "grad_norm": 16.289966583251953, "learning_rate": 7e-06, "loss": 0.6590662598609924, "step": 35 }, { "epoch": 0.0072, "grad_norm": 16.290184020996094, "learning_rate": 7.2000000000000005e-06, "loss": 0.5614698529243469, "step": 36 }, { "epoch": 0.0074, "grad_norm": 9.540451049804688, "learning_rate": 7.4e-06, "loss": 3.6535511016845703, "step": 37 }, { "epoch": 0.0076, "grad_norm": 8.212828636169434, "learning_rate": 7.600000000000001e-06, "loss": 3.6082541942596436, "step": 38 }, { "epoch": 0.0078, "grad_norm": 110.34765625, "learning_rate": 7.800000000000002e-06, "loss": 2.08210825920105, "step": 39 }, { "epoch": 0.008, "grad_norm": 54.909912109375, "learning_rate": 8.000000000000001e-06, "loss": 1.8885221481323242, "step": 40 }, { "epoch": 0.0082, "grad_norm": 14.795517921447754, "learning_rate": 8.2e-06, "loss": 0.8238609433174133, "step": 41 }, { "epoch": 0.0084, "grad_norm": 11.740867614746094, "learning_rate": 8.400000000000001e-06, "loss": 0.2301148623228073, "step": 42 }, { "epoch": 0.0086, "grad_norm": 12.639492988586426, "learning_rate": 8.6e-06, "loss": 0.42430806159973145, "step": 43 }, { "epoch": 0.0088, "grad_norm": 14.877533912658691, "learning_rate": 8.8e-06, "loss": 0.5195145606994629, "step": 44 }, { "epoch": 0.009, "grad_norm": 16.365663528442383, "learning_rate": 9e-06, "loss": 0.7476734519004822, "step": 45 }, { "epoch": 0.0092, "grad_norm": 15.314481735229492, "learning_rate": 9.200000000000002e-06, "loss": 0.6565811634063721, "step": 46 }, { "epoch": 0.0094, "grad_norm": 15.30256175994873, "learning_rate": 9.4e-06, "loss": 0.7285122871398926, "step": 47 }, { "epoch": 0.0096, "grad_norm": 15.99083137512207, "learning_rate": 9.600000000000001e-06, "loss": 0.7699318528175354, "step": 48 }, { "epoch": 0.0098, "grad_norm": 15.664243698120117, "learning_rate": 9.800000000000001e-06, "loss": 0.9520533084869385, "step": 49 }, { "epoch": 0.01, "grad_norm": 12.90129566192627, "learning_rate": 1e-05, "loss": 0.8631060719490051, "step": 50 }, { "epoch": 0.0102, "grad_norm": 10.566743850708008, "learning_rate": 1.02e-05, "loss": 0.4236431121826172, "step": 51 }, { "epoch": 0.0104, "grad_norm": 9.604334831237793, "learning_rate": 1.04e-05, "loss": 0.36946138739585876, "step": 52 }, { "epoch": 0.0106, "grad_norm": 43.98063659667969, "learning_rate": 1.0600000000000002e-05, "loss": 1.8339109420776367, "step": 53 }, { "epoch": 0.0108, "grad_norm": 23.898250579833984, "learning_rate": 1.0800000000000002e-05, "loss": 1.3176811933517456, "step": 54 }, { "epoch": 0.011, "grad_norm": 13.772536277770996, "learning_rate": 1.1000000000000001e-05, "loss": 0.5442234873771667, "step": 55 }, { "epoch": 0.0112, "grad_norm": 6.4163737297058105, "learning_rate": 1.1200000000000001e-05, "loss": 0.14489132165908813, "step": 56 }, { "epoch": 0.0114, "grad_norm": 19.58946990966797, "learning_rate": 1.14e-05, "loss": 1.6732454299926758, "step": 57 }, { "epoch": 0.0116, "grad_norm": 29.295209884643555, "learning_rate": 1.16e-05, "loss": 1.700900673866272, "step": 58 }, { "epoch": 0.0118, "grad_norm": 10.337576866149902, "learning_rate": 1.18e-05, "loss": 0.6807141304016113, "step": 59 }, { "epoch": 0.012, "grad_norm": 9.932164192199707, "learning_rate": 1.2e-05, "loss": 0.2126649022102356, "step": 60 }, { "epoch": 0.0122, "grad_norm": 14.373818397521973, "learning_rate": 1.22e-05, "loss": 0.5442662835121155, "step": 61 }, { "epoch": 0.0124, "grad_norm": 7.414956092834473, "learning_rate": 1.2400000000000002e-05, "loss": 0.20551882684230804, "step": 62 }, { "epoch": 0.0126, "grad_norm": 22.96013641357422, "learning_rate": 1.2600000000000001e-05, "loss": 0.9355632662773132, "step": 63 }, { "epoch": 0.0128, "grad_norm": 30.31879234313965, "learning_rate": 1.2800000000000001e-05, "loss": 2.546799898147583, "step": 64 }, { "epoch": 0.013, "grad_norm": 18.958202362060547, "learning_rate": 1.3000000000000001e-05, "loss": 1.2627061605453491, "step": 65 }, { "epoch": 0.0132, "grad_norm": 19.17605209350586, "learning_rate": 1.3200000000000002e-05, "loss": 1.7123245000839233, "step": 66 }, { "epoch": 0.0134, "grad_norm": 17.46062469482422, "learning_rate": 1.3400000000000002e-05, "loss": 0.7560626864433289, "step": 67 }, { "epoch": 0.0136, "grad_norm": 17.04747200012207, "learning_rate": 1.3600000000000002e-05, "loss": 0.7125154137611389, "step": 68 }, { "epoch": 0.0138, "grad_norm": 14.661514282226562, "learning_rate": 1.38e-05, "loss": 0.5681678652763367, "step": 69 }, { "epoch": 0.014, "grad_norm": 11.718018531799316, "learning_rate": 1.4e-05, "loss": 0.39901670813560486, "step": 70 }, { "epoch": 0.0142, "grad_norm": 17.879074096679688, "learning_rate": 1.4200000000000001e-05, "loss": 1.1150238513946533, "step": 71 }, { "epoch": 0.0144, "grad_norm": 16.158737182617188, "learning_rate": 1.4400000000000001e-05, "loss": 1.1416844129562378, "step": 72 }, { "epoch": 0.0146, "grad_norm": 17.92641258239746, "learning_rate": 1.46e-05, "loss": 1.118418574333191, "step": 73 }, { "epoch": 0.0148, "grad_norm": 14.92599105834961, "learning_rate": 1.48e-05, "loss": 1.0633187294006348, "step": 74 }, { "epoch": 0.015, "grad_norm": 12.820063591003418, "learning_rate": 1.5000000000000002e-05, "loss": 0.53122878074646, "step": 75 }, { "epoch": 0.0152, "grad_norm": 14.567853927612305, "learning_rate": 1.5200000000000002e-05, "loss": 0.4186933934688568, "step": 76 }, { "epoch": 0.0154, "grad_norm": 17.807376861572266, "learning_rate": 1.54e-05, "loss": 1.7758623361587524, "step": 77 }, { "epoch": 0.0156, "grad_norm": 16.614009857177734, "learning_rate": 1.5600000000000003e-05, "loss": 1.3614182472229004, "step": 78 }, { "epoch": 0.0158, "grad_norm": 20.342607498168945, "learning_rate": 1.58e-05, "loss": 1.0765376091003418, "step": 79 }, { "epoch": 0.016, "grad_norm": 18.532209396362305, "learning_rate": 1.6000000000000003e-05, "loss": 1.0294160842895508, "step": 80 }, { "epoch": 0.0162, "grad_norm": 14.95114803314209, "learning_rate": 1.62e-05, "loss": 0.9725584387779236, "step": 81 }, { "epoch": 0.0164, "grad_norm": 16.384130477905273, "learning_rate": 1.64e-05, "loss": 1.2944960594177246, "step": 82 }, { "epoch": 0.0166, "grad_norm": 20.59752082824707, "learning_rate": 1.66e-05, "loss": 1.2144062519073486, "step": 83 }, { "epoch": 0.0168, "grad_norm": 14.727459907531738, "learning_rate": 1.6800000000000002e-05, "loss": 1.4357770681381226, "step": 84 }, { "epoch": 0.017, "grad_norm": 13.784932136535645, "learning_rate": 1.7e-05, "loss": 0.8966814875602722, "step": 85 }, { "epoch": 0.0172, "grad_norm": 8.46639347076416, "learning_rate": 1.72e-05, "loss": 0.7025353312492371, "step": 86 }, { "epoch": 0.0174, "grad_norm": 17.623991012573242, "learning_rate": 1.7400000000000003e-05, "loss": 1.518816351890564, "step": 87 }, { "epoch": 0.0176, "grad_norm": 15.044212341308594, "learning_rate": 1.76e-05, "loss": 1.7205009460449219, "step": 88 }, { "epoch": 0.0178, "grad_norm": 15.654994010925293, "learning_rate": 1.7800000000000002e-05, "loss": 0.5644822120666504, "step": 89 }, { "epoch": 0.018, "grad_norm": 23.13344955444336, "learning_rate": 1.8e-05, "loss": 1.4888534545898438, "step": 90 }, { "epoch": 0.0182, "grad_norm": 15.345812797546387, "learning_rate": 1.8200000000000002e-05, "loss": 2.276603937149048, "step": 91 }, { "epoch": 0.0184, "grad_norm": 12.848119735717773, "learning_rate": 1.8400000000000003e-05, "loss": 2.165219306945801, "step": 92 }, { "epoch": 0.0186, "grad_norm": 12.793383598327637, "learning_rate": 1.86e-05, "loss": 0.40768125653266907, "step": 93 }, { "epoch": 0.0188, "grad_norm": 13.451870918273926, "learning_rate": 1.88e-05, "loss": 0.33898743987083435, "step": 94 }, { "epoch": 0.019, "grad_norm": 8.729146003723145, "learning_rate": 1.9e-05, "loss": 0.3523624837398529, "step": 95 }, { "epoch": 0.0192, "grad_norm": 11.61637020111084, "learning_rate": 1.9200000000000003e-05, "loss": 0.36027684807777405, "step": 96 }, { "epoch": 0.0194, "grad_norm": 11.825678825378418, "learning_rate": 1.94e-05, "loss": 0.4324101507663727, "step": 97 }, { "epoch": 0.0196, "grad_norm": 6.112161159515381, "learning_rate": 1.9600000000000002e-05, "loss": 0.12024092674255371, "step": 98 }, { "epoch": 0.0198, "grad_norm": 8.166031837463379, "learning_rate": 1.98e-05, "loss": 2.617506504058838, "step": 99 }, { "epoch": 0.02, "grad_norm": 6.198336124420166, "learning_rate": 2e-05, "loss": 2.6058919429779053, "step": 100 }, { "epoch": 0.0202, "grad_norm": 23.16619300842285, "learning_rate": 1.999591836734694e-05, "loss": 1.200722098350525, "step": 101 }, { "epoch": 0.0204, "grad_norm": 36.44799041748047, "learning_rate": 1.999183673469388e-05, "loss": 2.272332191467285, "step": 102 }, { "epoch": 0.0206, "grad_norm": 14.700854301452637, "learning_rate": 1.9987755102040818e-05, "loss": 1.4091886281967163, "step": 103 }, { "epoch": 0.0208, "grad_norm": 38.54850769042969, "learning_rate": 1.9983673469387756e-05, "loss": 1.6711570024490356, "step": 104 }, { "epoch": 0.021, "grad_norm": 19.840042114257812, "learning_rate": 1.9979591836734697e-05, "loss": 1.034736156463623, "step": 105 }, { "epoch": 0.0212, "grad_norm": 15.678369522094727, "learning_rate": 1.9975510204081634e-05, "loss": 1.1222563982009888, "step": 106 }, { "epoch": 0.0214, "grad_norm": 16.824556350708008, "learning_rate": 1.9971428571428572e-05, "loss": 1.1417587995529175, "step": 107 }, { "epoch": 0.0216, "grad_norm": 20.7105770111084, "learning_rate": 1.9967346938775513e-05, "loss": 1.1519407033920288, "step": 108 }, { "epoch": 0.0218, "grad_norm": 10.305238723754883, "learning_rate": 1.996326530612245e-05, "loss": 0.4236001968383789, "step": 109 }, { "epoch": 0.022, "grad_norm": 10.24266242980957, "learning_rate": 1.9959183673469388e-05, "loss": 0.31372806429862976, "step": 110 }, { "epoch": 0.0222, "grad_norm": 9.28374195098877, "learning_rate": 1.995510204081633e-05, "loss": 0.48246851563453674, "step": 111 }, { "epoch": 0.0224, "grad_norm": 7.033731460571289, "learning_rate": 1.9951020408163267e-05, "loss": 0.1980251520872116, "step": 112 }, { "epoch": 0.0226, "grad_norm": 16.573246002197266, "learning_rate": 1.9946938775510204e-05, "loss": 2.1796398162841797, "step": 113 }, { "epoch": 0.0228, "grad_norm": 13.532843589782715, "learning_rate": 1.9942857142857142e-05, "loss": 2.140252113342285, "step": 114 }, { "epoch": 0.023, "grad_norm": 11.697552680969238, "learning_rate": 1.9938775510204083e-05, "loss": 0.422529935836792, "step": 115 }, { "epoch": 0.0232, "grad_norm": 6.95910120010376, "learning_rate": 1.993469387755102e-05, "loss": 0.12921731173992157, "step": 116 }, { "epoch": 0.0234, "grad_norm": 9.09284782409668, "learning_rate": 1.993061224489796e-05, "loss": 0.36724767088890076, "step": 117 }, { "epoch": 0.0236, "grad_norm": 10.362861633300781, "learning_rate": 1.99265306122449e-05, "loss": 0.3813866376876831, "step": 118 }, { "epoch": 0.0238, "grad_norm": 13.540061950683594, "learning_rate": 1.9922448979591837e-05, "loss": 1.1938616037368774, "step": 119 }, { "epoch": 0.024, "grad_norm": 20.51814842224121, "learning_rate": 1.9918367346938775e-05, "loss": 0.9086930155754089, "step": 120 }, { "epoch": 0.0242, "grad_norm": 13.426894187927246, "learning_rate": 1.9914285714285716e-05, "loss": 0.5107138752937317, "step": 121 }, { "epoch": 0.0244, "grad_norm": 12.804654121398926, "learning_rate": 1.9910204081632653e-05, "loss": 0.561721682548523, "step": 122 }, { "epoch": 0.0246, "grad_norm": 20.744733810424805, "learning_rate": 1.9906122448979594e-05, "loss": 0.6595413684844971, "step": 123 }, { "epoch": 0.0248, "grad_norm": 14.705329895019531, "learning_rate": 1.9902040816326532e-05, "loss": 0.7842282652854919, "step": 124 }, { "epoch": 0.025, "grad_norm": 12.542791366577148, "learning_rate": 1.9897959183673473e-05, "loss": 0.6271759867668152, "step": 125 }, { "epoch": 0.0252, "grad_norm": 10.929245948791504, "learning_rate": 1.989387755102041e-05, "loss": 0.4905226230621338, "step": 126 }, { "epoch": 0.0254, "grad_norm": 14.044593811035156, "learning_rate": 1.988979591836735e-05, "loss": 0.9779181480407715, "step": 127 }, { "epoch": 0.0256, "grad_norm": 13.47326946258545, "learning_rate": 1.988571428571429e-05, "loss": 0.8366219401359558, "step": 128 }, { "epoch": 0.0258, "grad_norm": 10.467619895935059, "learning_rate": 1.9881632653061227e-05, "loss": 0.4179813861846924, "step": 129 }, { "epoch": 0.026, "grad_norm": 11.507650375366211, "learning_rate": 1.9877551020408165e-05, "loss": 0.37075725197792053, "step": 130 }, { "epoch": 0.0262, "grad_norm": 17.782169342041016, "learning_rate": 1.9873469387755106e-05, "loss": 1.3293887376785278, "step": 131 }, { "epoch": 0.0264, "grad_norm": 12.833648681640625, "learning_rate": 1.9869387755102043e-05, "loss": 0.45509305596351624, "step": 132 }, { "epoch": 0.0266, "grad_norm": 22.548070907592773, "learning_rate": 1.986530612244898e-05, "loss": 0.5744470357894897, "step": 133 }, { "epoch": 0.0268, "grad_norm": 12.30579948425293, "learning_rate": 1.9861224489795922e-05, "loss": 0.20529121160507202, "step": 134 }, { "epoch": 0.027, "grad_norm": 17.774799346923828, "learning_rate": 1.985714285714286e-05, "loss": 1.0186904668807983, "step": 135 }, { "epoch": 0.0272, "grad_norm": 14.874011993408203, "learning_rate": 1.9853061224489797e-05, "loss": 0.779259204864502, "step": 136 }, { "epoch": 0.0274, "grad_norm": 10.167969703674316, "learning_rate": 1.984897959183674e-05, "loss": 0.1621553897857666, "step": 137 }, { "epoch": 0.0276, "grad_norm": 7.144793510437012, "learning_rate": 1.9844897959183676e-05, "loss": 0.09167339652776718, "step": 138 }, { "epoch": 0.0278, "grad_norm": 12.632438659667969, "learning_rate": 1.9840816326530614e-05, "loss": 0.5062233805656433, "step": 139 }, { "epoch": 0.028, "grad_norm": 10.130064964294434, "learning_rate": 1.983673469387755e-05, "loss": 0.16204434633255005, "step": 140 }, { "epoch": 0.0282, "grad_norm": 12.270245552062988, "learning_rate": 1.9832653061224492e-05, "loss": 0.43254661560058594, "step": 141 }, { "epoch": 0.0284, "grad_norm": 7.125492095947266, "learning_rate": 1.982857142857143e-05, "loss": 0.17016957700252533, "step": 142 }, { "epoch": 0.0286, "grad_norm": 50.75775909423828, "learning_rate": 1.9824489795918368e-05, "loss": 1.619408130645752, "step": 143 }, { "epoch": 0.0288, "grad_norm": 58.936527252197266, "learning_rate": 1.982040816326531e-05, "loss": 2.543792963027954, "step": 144 }, { "epoch": 0.029, "grad_norm": 11.407219886779785, "learning_rate": 1.9816326530612246e-05, "loss": 0.23832444846630096, "step": 145 }, { "epoch": 0.0292, "grad_norm": 3.9544060230255127, "learning_rate": 1.9812244897959184e-05, "loss": 0.11590949445962906, "step": 146 }, { "epoch": 0.0294, "grad_norm": 10.821979522705078, "learning_rate": 1.9808163265306125e-05, "loss": 0.5023337006568909, "step": 147 }, { "epoch": 0.0296, "grad_norm": 9.180195808410645, "learning_rate": 1.9804081632653063e-05, "loss": 0.38146814703941345, "step": 148 }, { "epoch": 0.0298, "grad_norm": 16.268226623535156, "learning_rate": 1.98e-05, "loss": 0.5793936848640442, "step": 149 }, { "epoch": 0.03, "grad_norm": 17.789043426513672, "learning_rate": 1.979591836734694e-05, "loss": 0.5812906622886658, "step": 150 }, { "epoch": 0.0302, "grad_norm": 13.205047607421875, "learning_rate": 1.979183673469388e-05, "loss": 0.7076604962348938, "step": 151 }, { "epoch": 0.0304, "grad_norm": 16.953746795654297, "learning_rate": 1.9787755102040816e-05, "loss": 0.7535090446472168, "step": 152 }, { "epoch": 0.0306, "grad_norm": 13.64711856842041, "learning_rate": 1.9783673469387757e-05, "loss": 0.5489466786384583, "step": 153 }, { "epoch": 0.0308, "grad_norm": 12.464900970458984, "learning_rate": 1.9779591836734695e-05, "loss": 0.3147149384021759, "step": 154 }, { "epoch": 0.031, "grad_norm": 15.660754203796387, "learning_rate": 1.9775510204081633e-05, "loss": 0.5840892195701599, "step": 155 }, { "epoch": 0.0312, "grad_norm": 12.983843803405762, "learning_rate": 1.9771428571428574e-05, "loss": 0.16638237237930298, "step": 156 }, { "epoch": 0.0314, "grad_norm": 10.69919490814209, "learning_rate": 1.976734693877551e-05, "loss": 0.4699489176273346, "step": 157 }, { "epoch": 0.0316, "grad_norm": 8.013006210327148, "learning_rate": 1.976326530612245e-05, "loss": 0.140150785446167, "step": 158 }, { "epoch": 0.0318, "grad_norm": 10.338350296020508, "learning_rate": 1.975918367346939e-05, "loss": 0.6970502734184265, "step": 159 }, { "epoch": 0.032, "grad_norm": 18.74736976623535, "learning_rate": 1.9755102040816328e-05, "loss": 0.6801952719688416, "step": 160 }, { "epoch": 0.0322, "grad_norm": 14.109589576721191, "learning_rate": 1.9751020408163265e-05, "loss": 1.7827507257461548, "step": 161 }, { "epoch": 0.0324, "grad_norm": 15.422320365905762, "learning_rate": 1.9746938775510206e-05, "loss": 1.477315902709961, "step": 162 }, { "epoch": 0.0326, "grad_norm": 21.485685348510742, "learning_rate": 1.9742857142857144e-05, "loss": 1.0516995191574097, "step": 163 }, { "epoch": 0.0328, "grad_norm": 18.223512649536133, "learning_rate": 1.973877551020408e-05, "loss": 1.0833985805511475, "step": 164 }, { "epoch": 0.033, "grad_norm": 10.750219345092773, "learning_rate": 1.9734693877551023e-05, "loss": 1.9473018646240234, "step": 165 }, { "epoch": 0.0332, "grad_norm": 10.971612930297852, "learning_rate": 1.973061224489796e-05, "loss": 1.8101927042007446, "step": 166 }, { "epoch": 0.0334, "grad_norm": 10.93656063079834, "learning_rate": 1.9726530612244898e-05, "loss": 2.7729265689849854, "step": 167 }, { "epoch": 0.0336, "grad_norm": 10.621392250061035, "learning_rate": 1.9722448979591836e-05, "loss": 2.4153614044189453, "step": 168 }, { "epoch": 0.0338, "grad_norm": 8.495170593261719, "learning_rate": 1.9718367346938777e-05, "loss": 0.2918655276298523, "step": 169 }, { "epoch": 0.034, "grad_norm": 9.634339332580566, "learning_rate": 1.9714285714285718e-05, "loss": 0.3897497355937958, "step": 170 }, { "epoch": 0.0342, "grad_norm": 7.359801769256592, "learning_rate": 1.9710204081632655e-05, "loss": 0.2934911847114563, "step": 171 }, { "epoch": 0.0344, "grad_norm": 19.19106674194336, "learning_rate": 1.9706122448979593e-05, "loss": 0.25113484263420105, "step": 172 }, { "epoch": 0.0346, "grad_norm": 11.436328887939453, "learning_rate": 1.9702040816326534e-05, "loss": 1.3752632141113281, "step": 173 }, { "epoch": 0.0348, "grad_norm": 11.581891059875488, "learning_rate": 1.969795918367347e-05, "loss": 1.111779808998108, "step": 174 }, { "epoch": 0.035, "grad_norm": 10.153788566589355, "learning_rate": 1.969387755102041e-05, "loss": 0.21362058818340302, "step": 175 }, { "epoch": 0.0352, "grad_norm": 9.029555320739746, "learning_rate": 1.968979591836735e-05, "loss": 0.18420429527759552, "step": 176 }, { "epoch": 0.0354, "grad_norm": 12.224593162536621, "learning_rate": 1.9685714285714288e-05, "loss": 0.8582136034965515, "step": 177 }, { "epoch": 0.0356, "grad_norm": 12.30530834197998, "learning_rate": 1.9681632653061226e-05, "loss": 0.790539562702179, "step": 178 }, { "epoch": 0.0358, "grad_norm": 11.196264266967773, "learning_rate": 1.9677551020408167e-05, "loss": 0.7992145419120789, "step": 179 }, { "epoch": 0.036, "grad_norm": 11.926046371459961, "learning_rate": 1.9673469387755104e-05, "loss": 1.205533742904663, "step": 180 }, { "epoch": 0.0362, "grad_norm": 11.32097053527832, "learning_rate": 1.9669387755102042e-05, "loss": 2.065629243850708, "step": 181 }, { "epoch": 0.0364, "grad_norm": 10.51060962677002, "learning_rate": 1.9665306122448983e-05, "loss": 1.9534316062927246, "step": 182 }, { "epoch": 0.0366, "grad_norm": 12.027915000915527, "learning_rate": 1.966122448979592e-05, "loss": 0.6187699437141418, "step": 183 }, { "epoch": 0.0368, "grad_norm": 10.165942192077637, "learning_rate": 1.9657142857142858e-05, "loss": 0.8004693984985352, "step": 184 }, { "epoch": 0.037, "grad_norm": 13.300896644592285, "learning_rate": 1.96530612244898e-05, "loss": 0.57137131690979, "step": 185 }, { "epoch": 0.0372, "grad_norm": 11.380193710327148, "learning_rate": 1.9648979591836737e-05, "loss": 0.24036674201488495, "step": 186 }, { "epoch": 0.0374, "grad_norm": 15.774130821228027, "learning_rate": 1.9644897959183674e-05, "loss": 0.5146348476409912, "step": 187 }, { "epoch": 0.0376, "grad_norm": 14.454614639282227, "learning_rate": 1.9640816326530616e-05, "loss": 1.4913374185562134, "step": 188 }, { "epoch": 0.0378, "grad_norm": 12.067485809326172, "learning_rate": 1.9636734693877553e-05, "loss": 0.6330704689025879, "step": 189 }, { "epoch": 0.038, "grad_norm": 12.86535358428955, "learning_rate": 1.963265306122449e-05, "loss": 1.4309087991714478, "step": 190 }, { "epoch": 0.0382, "grad_norm": 11.127947807312012, "learning_rate": 1.9628571428571432e-05, "loss": 0.7268363833427429, "step": 191 }, { "epoch": 0.0384, "grad_norm": 10.412270545959473, "learning_rate": 1.962448979591837e-05, "loss": 1.0172964334487915, "step": 192 }, { "epoch": 0.0386, "grad_norm": 8.217020034790039, "learning_rate": 1.9620408163265307e-05, "loss": 0.34921762347221375, "step": 193 }, { "epoch": 0.0388, "grad_norm": 9.945459365844727, "learning_rate": 1.9616326530612245e-05, "loss": 0.3862124979496002, "step": 194 }, { "epoch": 0.039, "grad_norm": 6.348339080810547, "learning_rate": 1.9612244897959186e-05, "loss": 0.26032719016075134, "step": 195 }, { "epoch": 0.0392, "grad_norm": 10.936368942260742, "learning_rate": 1.9608163265306123e-05, "loss": 0.3817369043827057, "step": 196 }, { "epoch": 0.0394, "grad_norm": 17.57430648803711, "learning_rate": 1.960408163265306e-05, "loss": 0.8593737483024597, "step": 197 }, { "epoch": 0.0396, "grad_norm": 12.015942573547363, "learning_rate": 1.9600000000000002e-05, "loss": 0.8226779103279114, "step": 198 }, { "epoch": 0.0398, "grad_norm": 14.96813678741455, "learning_rate": 1.959591836734694e-05, "loss": 2.1698710918426514, "step": 199 }, { "epoch": 0.04, "grad_norm": 13.83712387084961, "learning_rate": 1.9591836734693877e-05, "loss": 2.1152279376983643, "step": 200 }, { "epoch": 0.0402, "grad_norm": 12.426931381225586, "learning_rate": 1.958775510204082e-05, "loss": 0.5947772264480591, "step": 201 }, { "epoch": 0.0404, "grad_norm": 6.404804229736328, "learning_rate": 1.9583673469387756e-05, "loss": 0.09137320518493652, "step": 202 }, { "epoch": 0.0406, "grad_norm": 20.05866050720215, "learning_rate": 1.9579591836734694e-05, "loss": 0.9587116241455078, "step": 203 }, { "epoch": 0.0408, "grad_norm": 13.285184860229492, "learning_rate": 1.9575510204081635e-05, "loss": 0.7525708079338074, "step": 204 }, { "epoch": 0.041, "grad_norm": 18.103675842285156, "learning_rate": 1.9571428571428572e-05, "loss": 0.9542830586433411, "step": 205 }, { "epoch": 0.0412, "grad_norm": 18.912912368774414, "learning_rate": 1.956734693877551e-05, "loss": 1.5049800872802734, "step": 206 }, { "epoch": 0.0414, "grad_norm": 17.95879554748535, "learning_rate": 1.956326530612245e-05, "loss": 0.4391709864139557, "step": 207 }, { "epoch": 0.0416, "grad_norm": 16.980247497558594, "learning_rate": 1.955918367346939e-05, "loss": 0.880516529083252, "step": 208 }, { "epoch": 0.0418, "grad_norm": 25.475059509277344, "learning_rate": 1.9555102040816326e-05, "loss": 1.8526705503463745, "step": 209 }, { "epoch": 0.042, "grad_norm": 27.993799209594727, "learning_rate": 1.9551020408163267e-05, "loss": 2.4861631393432617, "step": 210 }, { "epoch": 0.0422, "grad_norm": 13.218708038330078, "learning_rate": 1.9546938775510205e-05, "loss": 0.7200023531913757, "step": 211 }, { "epoch": 0.0424, "grad_norm": 7.854865074157715, "learning_rate": 1.9542857142857143e-05, "loss": 0.4573878049850464, "step": 212 }, { "epoch": 0.0426, "grad_norm": 16.14672088623047, "learning_rate": 1.9538775510204084e-05, "loss": 0.5513072609901428, "step": 213 }, { "epoch": 0.0428, "grad_norm": 16.056869506835938, "learning_rate": 1.9534693877551025e-05, "loss": 0.41998934745788574, "step": 214 }, { "epoch": 0.043, "grad_norm": 15.660920143127441, "learning_rate": 1.9530612244897962e-05, "loss": 0.292898565530777, "step": 215 }, { "epoch": 0.0432, "grad_norm": 15.301045417785645, "learning_rate": 1.95265306122449e-05, "loss": 0.37798991799354553, "step": 216 }, { "epoch": 0.0434, "grad_norm": 12.420317649841309, "learning_rate": 1.952244897959184e-05, "loss": 1.0150760412216187, "step": 217 }, { "epoch": 0.0436, "grad_norm": 12.52476692199707, "learning_rate": 1.951836734693878e-05, "loss": 1.0119246244430542, "step": 218 }, { "epoch": 0.0438, "grad_norm": 13.150724411010742, "learning_rate": 1.9514285714285716e-05, "loss": 1.8081260919570923, "step": 219 }, { "epoch": 0.044, "grad_norm": 14.73547077178955, "learning_rate": 1.9510204081632654e-05, "loss": 1.8981086015701294, "step": 220 }, { "epoch": 0.0442, "grad_norm": 12.522819519042969, "learning_rate": 1.9506122448979595e-05, "loss": 0.4660833179950714, "step": 221 }, { "epoch": 0.0444, "grad_norm": 8.506572723388672, "learning_rate": 1.9502040816326533e-05, "loss": 0.36523711681365967, "step": 222 }, { "epoch": 0.0446, "grad_norm": 13.08540153503418, "learning_rate": 1.949795918367347e-05, "loss": 0.4142066538333893, "step": 223 }, { "epoch": 0.0448, "grad_norm": 4.7789154052734375, "learning_rate": 1.949387755102041e-05, "loss": 0.043511006981134415, "step": 224 }, { "epoch": 0.045, "grad_norm": 8.255807876586914, "learning_rate": 1.948979591836735e-05, "loss": 0.341126412153244, "step": 225 }, { "epoch": 0.0452, "grad_norm": 8.003602981567383, "learning_rate": 1.9485714285714286e-05, "loss": 0.30454233288764954, "step": 226 }, { "epoch": 0.0454, "grad_norm": 11.212254524230957, "learning_rate": 1.9481632653061227e-05, "loss": 0.4171302318572998, "step": 227 }, { "epoch": 0.0456, "grad_norm": 9.085224151611328, "learning_rate": 1.9477551020408165e-05, "loss": 0.4786769151687622, "step": 228 }, { "epoch": 0.0458, "grad_norm": 10.804896354675293, "learning_rate": 1.9473469387755103e-05, "loss": 1.325444221496582, "step": 229 }, { "epoch": 0.046, "grad_norm": 13.959992408752441, "learning_rate": 1.9469387755102044e-05, "loss": 0.5055341720581055, "step": 230 }, { "epoch": 0.0462, "grad_norm": 10.12248706817627, "learning_rate": 1.946530612244898e-05, "loss": 0.2702232301235199, "step": 231 }, { "epoch": 0.0464, "grad_norm": 6.399989128112793, "learning_rate": 1.946122448979592e-05, "loss": 0.13826696574687958, "step": 232 }, { "epoch": 0.0466, "grad_norm": 10.234905242919922, "learning_rate": 1.945714285714286e-05, "loss": 1.8679510354995728, "step": 233 }, { "epoch": 0.0468, "grad_norm": 10.236428260803223, "learning_rate": 1.9453061224489798e-05, "loss": 1.8090623617172241, "step": 234 }, { "epoch": 0.047, "grad_norm": 8.79781723022461, "learning_rate": 1.9448979591836735e-05, "loss": 1.9289888143539429, "step": 235 }, { "epoch": 0.0472, "grad_norm": 10.299851417541504, "learning_rate": 1.9444897959183676e-05, "loss": 1.7997150421142578, "step": 236 }, { "epoch": 0.0474, "grad_norm": 10.11899185180664, "learning_rate": 1.9440816326530614e-05, "loss": 0.24996638298034668, "step": 237 }, { "epoch": 0.0476, "grad_norm": 6.126742839813232, "learning_rate": 1.943673469387755e-05, "loss": 0.10375798493623734, "step": 238 }, { "epoch": 0.0478, "grad_norm": 12.840261459350586, "learning_rate": 1.9432653061224493e-05, "loss": 0.5607141852378845, "step": 239 }, { "epoch": 0.048, "grad_norm": 13.04357624053955, "learning_rate": 1.942857142857143e-05, "loss": 0.19896751642227173, "step": 240 }, { "epoch": 0.0482, "grad_norm": 12.065544128417969, "learning_rate": 1.9424489795918368e-05, "loss": 0.5035838484764099, "step": 241 }, { "epoch": 0.0484, "grad_norm": 6.4033379554748535, "learning_rate": 1.942040816326531e-05, "loss": 0.08241923898458481, "step": 242 }, { "epoch": 0.0486, "grad_norm": 13.771985054016113, "learning_rate": 1.9416326530612247e-05, "loss": 0.9879512190818787, "step": 243 }, { "epoch": 0.0488, "grad_norm": 9.499734878540039, "learning_rate": 1.9412244897959184e-05, "loss": 0.2659399211406708, "step": 244 }, { "epoch": 0.049, "grad_norm": 9.90380573272705, "learning_rate": 1.9408163265306122e-05, "loss": 0.4616040289402008, "step": 245 }, { "epoch": 0.0492, "grad_norm": 8.802172660827637, "learning_rate": 1.9404081632653063e-05, "loss": 0.2890472114086151, "step": 246 }, { "epoch": 0.0494, "grad_norm": 12.497790336608887, "learning_rate": 1.94e-05, "loss": 1.005435585975647, "step": 247 }, { "epoch": 0.0496, "grad_norm": 13.029925346374512, "learning_rate": 1.9395918367346938e-05, "loss": 0.9115619659423828, "step": 248 }, { "epoch": 0.0498, "grad_norm": 10.337368965148926, "learning_rate": 1.939183673469388e-05, "loss": 0.3756994903087616, "step": 249 }, { "epoch": 0.05, "grad_norm": 36.60479736328125, "learning_rate": 1.9387755102040817e-05, "loss": 0.40144655108451843, "step": 250 }, { "epoch": 0.0502, "grad_norm": 8.426389694213867, "learning_rate": 1.9383673469387755e-05, "loss": 0.38817885518074036, "step": 251 }, { "epoch": 0.0504, "grad_norm": 9.67792797088623, "learning_rate": 1.9379591836734696e-05, "loss": 0.13594162464141846, "step": 252 }, { "epoch": 0.0506, "grad_norm": 6.6829328536987305, "learning_rate": 1.9375510204081633e-05, "loss": 0.06705179065465927, "step": 253 }, { "epoch": 0.0508, "grad_norm": 7.050276279449463, "learning_rate": 1.937142857142857e-05, "loss": 0.06950942426919937, "step": 254 }, { "epoch": 0.051, "grad_norm": 9.589434623718262, "learning_rate": 1.9367346938775512e-05, "loss": 0.3774583637714386, "step": 255 }, { "epoch": 0.0512, "grad_norm": 10.09981918334961, "learning_rate": 1.936326530612245e-05, "loss": 0.3024117946624756, "step": 256 }, { "epoch": 0.0514, "grad_norm": 17.173498153686523, "learning_rate": 1.9359183673469387e-05, "loss": 1.305375337600708, "step": 257 }, { "epoch": 0.0516, "grad_norm": 9.821687698364258, "learning_rate": 1.9355102040816328e-05, "loss": 0.3765145540237427, "step": 258 }, { "epoch": 0.0518, "grad_norm": 9.418091773986816, "learning_rate": 1.935102040816327e-05, "loss": 0.2349083572626114, "step": 259 }, { "epoch": 0.052, "grad_norm": 11.253679275512695, "learning_rate": 1.9346938775510207e-05, "loss": 0.2814233601093292, "step": 260 }, { "epoch": 0.0522, "grad_norm": 17.10232925415039, "learning_rate": 1.9342857142857144e-05, "loss": 0.7681066989898682, "step": 261 }, { "epoch": 0.0524, "grad_norm": 16.147369384765625, "learning_rate": 1.9338775510204086e-05, "loss": 0.9630041718482971, "step": 262 }, { "epoch": 0.0526, "grad_norm": 14.772577285766602, "learning_rate": 1.9334693877551023e-05, "loss": 0.7542390823364258, "step": 263 }, { "epoch": 0.0528, "grad_norm": 8.295753479003906, "learning_rate": 1.933061224489796e-05, "loss": 0.5259800553321838, "step": 264 }, { "epoch": 0.053, "grad_norm": 18.530479431152344, "learning_rate": 1.9326530612244902e-05, "loss": 0.5631456971168518, "step": 265 }, { "epoch": 0.0532, "grad_norm": 18.48293685913086, "learning_rate": 1.932244897959184e-05, "loss": 0.5957385301589966, "step": 266 }, { "epoch": 0.0534, "grad_norm": 10.628904342651367, "learning_rate": 1.9318367346938777e-05, "loss": 0.5219213366508484, "step": 267 }, { "epoch": 0.0536, "grad_norm": 9.826141357421875, "learning_rate": 1.9314285714285718e-05, "loss": 0.16056272387504578, "step": 268 }, { "epoch": 0.0538, "grad_norm": 10.817519187927246, "learning_rate": 1.9310204081632656e-05, "loss": 0.3732954263687134, "step": 269 }, { "epoch": 0.054, "grad_norm": 9.00774097442627, "learning_rate": 1.9306122448979593e-05, "loss": 0.5658559799194336, "step": 270 }, { "epoch": 0.0542, "grad_norm": 16.364824295043945, "learning_rate": 1.930204081632653e-05, "loss": 1.1479178667068481, "step": 271 }, { "epoch": 0.0544, "grad_norm": 15.974719047546387, "learning_rate": 1.9297959183673472e-05, "loss": 1.3299745321273804, "step": 272 }, { "epoch": 0.0546, "grad_norm": 13.28140640258789, "learning_rate": 1.929387755102041e-05, "loss": 0.43030405044555664, "step": 273 }, { "epoch": 0.0548, "grad_norm": 12.720165252685547, "learning_rate": 1.9289795918367347e-05, "loss": 0.46472522616386414, "step": 274 }, { "epoch": 0.055, "grad_norm": 16.799394607543945, "learning_rate": 1.928571428571429e-05, "loss": 0.9537304043769836, "step": 275 }, { "epoch": 0.0552, "grad_norm": 15.223164558410645, "learning_rate": 1.9281632653061226e-05, "loss": 0.7769754528999329, "step": 276 }, { "epoch": 0.0554, "grad_norm": 13.078629493713379, "learning_rate": 1.9277551020408164e-05, "loss": 1.3505254983901978, "step": 277 }, { "epoch": 0.0556, "grad_norm": 16.278898239135742, "learning_rate": 1.9273469387755105e-05, "loss": 1.538793683052063, "step": 278 }, { "epoch": 0.0558, "grad_norm": 17.326444625854492, "learning_rate": 1.9269387755102042e-05, "loss": 1.2157453298568726, "step": 279 }, { "epoch": 0.056, "grad_norm": 13.043142318725586, "learning_rate": 1.926530612244898e-05, "loss": 1.0745325088500977, "step": 280 }, { "epoch": 0.0562, "grad_norm": 13.386103630065918, "learning_rate": 1.926122448979592e-05, "loss": 1.6535282135009766, "step": 281 }, { "epoch": 0.0564, "grad_norm": 14.423442840576172, "learning_rate": 1.925714285714286e-05, "loss": 1.681785225868225, "step": 282 }, { "epoch": 0.0566, "grad_norm": 20.163217544555664, "learning_rate": 1.9253061224489796e-05, "loss": 0.7129221558570862, "step": 283 }, { "epoch": 0.0568, "grad_norm": 18.918493270874023, "learning_rate": 1.9248979591836737e-05, "loss": 0.8880770802497864, "step": 284 }, { "epoch": 0.057, "grad_norm": 13.713382720947266, "learning_rate": 1.9244897959183675e-05, "loss": 0.7181944847106934, "step": 285 }, { "epoch": 0.0572, "grad_norm": 6.4058518409729, "learning_rate": 1.9240816326530613e-05, "loss": 0.08155521005392075, "step": 286 }, { "epoch": 0.0574, "grad_norm": 12.686845779418945, "learning_rate": 1.9236734693877554e-05, "loss": 0.6097404360771179, "step": 287 }, { "epoch": 0.0576, "grad_norm": 5.447309494018555, "learning_rate": 1.923265306122449e-05, "loss": 0.09038770943880081, "step": 288 }, { "epoch": 0.0578, "grad_norm": 17.65273666381836, "learning_rate": 1.922857142857143e-05, "loss": 1.1048628091812134, "step": 289 }, { "epoch": 0.058, "grad_norm": 16.5939998626709, "learning_rate": 1.922448979591837e-05, "loss": 0.8333539962768555, "step": 290 }, { "epoch": 0.0582, "grad_norm": 11.614035606384277, "learning_rate": 1.9220408163265308e-05, "loss": 0.4503566026687622, "step": 291 }, { "epoch": 0.0584, "grad_norm": 7.452908515930176, "learning_rate": 1.9216326530612245e-05, "loss": 0.14592118561267853, "step": 292 }, { "epoch": 0.0586, "grad_norm": 12.480263710021973, "learning_rate": 1.9212244897959186e-05, "loss": 0.598111093044281, "step": 293 }, { "epoch": 0.0588, "grad_norm": 10.65322494506836, "learning_rate": 1.9208163265306124e-05, "loss": 0.4835715591907501, "step": 294 }, { "epoch": 0.059, "grad_norm": 13.473339080810547, "learning_rate": 1.920408163265306e-05, "loss": 1.2906862497329712, "step": 295 }, { "epoch": 0.0592, "grad_norm": 11.87508487701416, "learning_rate": 1.9200000000000003e-05, "loss": 1.0924484729766846, "step": 296 }, { "epoch": 0.0594, "grad_norm": 11.235217094421387, "learning_rate": 1.919591836734694e-05, "loss": 0.70457524061203, "step": 297 }, { "epoch": 0.0596, "grad_norm": 9.966221809387207, "learning_rate": 1.9191836734693878e-05, "loss": 0.7067977786064148, "step": 298 }, { "epoch": 0.0598, "grad_norm": 24.72576904296875, "learning_rate": 1.9187755102040815e-05, "loss": 1.2337020635604858, "step": 299 }, { "epoch": 0.06, "grad_norm": 32.2328987121582, "learning_rate": 1.9183673469387756e-05, "loss": 2.0896799564361572, "step": 300 }, { "epoch": 0.0602, "grad_norm": 11.01812744140625, "learning_rate": 1.9179591836734694e-05, "loss": 0.8730123043060303, "step": 301 }, { "epoch": 0.0604, "grad_norm": 9.47146987915039, "learning_rate": 1.9175510204081632e-05, "loss": 0.849687397480011, "step": 302 }, { "epoch": 0.0606, "grad_norm": 13.432365417480469, "learning_rate": 1.9171428571428573e-05, "loss": 1.5861668586730957, "step": 303 }, { "epoch": 0.0608, "grad_norm": 13.760697364807129, "learning_rate": 1.916734693877551e-05, "loss": 2.072270631790161, "step": 304 }, { "epoch": 0.061, "grad_norm": 15.910550117492676, "learning_rate": 1.916326530612245e-05, "loss": 0.7796235680580139, "step": 305 }, { "epoch": 0.0612, "grad_norm": 17.948802947998047, "learning_rate": 1.915918367346939e-05, "loss": 1.462945580482483, "step": 306 }, { "epoch": 0.0614, "grad_norm": 20.616914749145508, "learning_rate": 1.915510204081633e-05, "loss": 1.0277634859085083, "step": 307 }, { "epoch": 0.0616, "grad_norm": 19.797767639160156, "learning_rate": 1.9151020408163268e-05, "loss": 1.071796178817749, "step": 308 }, { "epoch": 0.0618, "grad_norm": 9.767040252685547, "learning_rate": 1.9146938775510205e-05, "loss": 0.3060181736946106, "step": 309 }, { "epoch": 0.062, "grad_norm": 8.75549602508545, "learning_rate": 1.9142857142857146e-05, "loss": 0.3348238170146942, "step": 310 }, { "epoch": 0.0622, "grad_norm": 10.77705192565918, "learning_rate": 1.9138775510204084e-05, "loss": 0.69964998960495, "step": 311 }, { "epoch": 0.0624, "grad_norm": 10.151534080505371, "learning_rate": 1.913469387755102e-05, "loss": 1.0875051021575928, "step": 312 }, { "epoch": 0.0626, "grad_norm": 9.344226837158203, "learning_rate": 1.9130612244897963e-05, "loss": 0.2267366647720337, "step": 313 }, { "epoch": 0.0628, "grad_norm": 7.4025397300720215, "learning_rate": 1.91265306122449e-05, "loss": 0.10137917846441269, "step": 314 }, { "epoch": 0.063, "grad_norm": 10.967966079711914, "learning_rate": 1.9122448979591838e-05, "loss": 0.4692521393299103, "step": 315 }, { "epoch": 0.0632, "grad_norm": 11.424884796142578, "learning_rate": 1.911836734693878e-05, "loss": 0.244221031665802, "step": 316 }, { "epoch": 0.0634, "grad_norm": 13.277408599853516, "learning_rate": 1.9114285714285717e-05, "loss": 0.6109925508499146, "step": 317 }, { "epoch": 0.0636, "grad_norm": 7.46019983291626, "learning_rate": 1.9110204081632654e-05, "loss": 0.13937990367412567, "step": 318 }, { "epoch": 0.0638, "grad_norm": 19.776010513305664, "learning_rate": 1.9106122448979595e-05, "loss": 0.6616011261940002, "step": 319 }, { "epoch": 0.064, "grad_norm": 19.98860740661621, "learning_rate": 1.9102040816326533e-05, "loss": 0.7814601063728333, "step": 320 }, { "epoch": 0.0642, "grad_norm": 11.662542343139648, "learning_rate": 1.909795918367347e-05, "loss": 0.426974892616272, "step": 321 }, { "epoch": 0.0644, "grad_norm": 5.7308526039123535, "learning_rate": 1.909387755102041e-05, "loss": 0.05896879360079765, "step": 322 }, { "epoch": 0.0646, "grad_norm": 10.513677597045898, "learning_rate": 1.908979591836735e-05, "loss": 0.4508655071258545, "step": 323 }, { "epoch": 0.0648, "grad_norm": 7.710137844085693, "learning_rate": 1.9085714285714287e-05, "loss": 0.2659468948841095, "step": 324 }, { "epoch": 0.065, "grad_norm": 14.837307929992676, "learning_rate": 1.9081632653061225e-05, "loss": 0.975304365158081, "step": 325 }, { "epoch": 0.0652, "grad_norm": 17.25518035888672, "learning_rate": 1.9077551020408166e-05, "loss": 0.8888130187988281, "step": 326 }, { "epoch": 0.0654, "grad_norm": 12.68120288848877, "learning_rate": 1.9073469387755103e-05, "loss": 0.40455499291419983, "step": 327 }, { "epoch": 0.0656, "grad_norm": 8.175586700439453, "learning_rate": 1.906938775510204e-05, "loss": 0.4724791944026947, "step": 328 }, { "epoch": 0.0658, "grad_norm": 13.331406593322754, "learning_rate": 1.9065306122448982e-05, "loss": 1.1158009767532349, "step": 329 }, { "epoch": 0.066, "grad_norm": 13.087471008300781, "learning_rate": 1.906122448979592e-05, "loss": 0.8100793361663818, "step": 330 }, { "epoch": 0.0662, "grad_norm": 11.871191024780273, "learning_rate": 1.9057142857142857e-05, "loss": 0.5084172487258911, "step": 331 }, { "epoch": 0.0664, "grad_norm": 15.806512832641602, "learning_rate": 1.9053061224489798e-05, "loss": 0.4743104875087738, "step": 332 }, { "epoch": 0.0666, "grad_norm": 9.702177047729492, "learning_rate": 1.9048979591836736e-05, "loss": 0.6350889801979065, "step": 333 }, { "epoch": 0.0668, "grad_norm": 9.146988868713379, "learning_rate": 1.9044897959183673e-05, "loss": 0.49416258931159973, "step": 334 }, { "epoch": 0.067, "grad_norm": 11.399431228637695, "learning_rate": 1.9040816326530614e-05, "loss": 0.8340964913368225, "step": 335 }, { "epoch": 0.0672, "grad_norm": 7.515564441680908, "learning_rate": 1.9036734693877552e-05, "loss": 0.6184783577919006, "step": 336 }, { "epoch": 0.0674, "grad_norm": 12.320131301879883, "learning_rate": 1.903265306122449e-05, "loss": 0.41817808151245117, "step": 337 }, { "epoch": 0.0676, "grad_norm": 9.78826904296875, "learning_rate": 1.902857142857143e-05, "loss": 0.34572771191596985, "step": 338 }, { "epoch": 0.0678, "grad_norm": 11.655746459960938, "learning_rate": 1.902448979591837e-05, "loss": 0.5032246708869934, "step": 339 }, { "epoch": 0.068, "grad_norm": 10.898517608642578, "learning_rate": 1.9020408163265306e-05, "loss": 0.48967209458351135, "step": 340 }, { "epoch": 0.0682, "grad_norm": 13.701338768005371, "learning_rate": 1.9016326530612247e-05, "loss": 0.5528108477592468, "step": 341 }, { "epoch": 0.0684, "grad_norm": 4.497371196746826, "learning_rate": 1.9012244897959185e-05, "loss": 0.04467979073524475, "step": 342 }, { "epoch": 0.0686, "grad_norm": 8.856463432312012, "learning_rate": 1.9008163265306122e-05, "loss": 0.42360737919807434, "step": 343 }, { "epoch": 0.0688, "grad_norm": 18.086170196533203, "learning_rate": 1.9004081632653063e-05, "loss": 0.14533616602420807, "step": 344 }, { "epoch": 0.069, "grad_norm": 15.018651008605957, "learning_rate": 1.9e-05, "loss": 2.9416942596435547, "step": 345 }, { "epoch": 0.0692, "grad_norm": 12.553912162780762, "learning_rate": 1.899591836734694e-05, "loss": 2.9571597576141357, "step": 346 }, { "epoch": 0.0694, "grad_norm": 10.689021110534668, "learning_rate": 1.899183673469388e-05, "loss": 0.6087742447853088, "step": 347 }, { "epoch": 0.0696, "grad_norm": 9.105196952819824, "learning_rate": 1.8987755102040817e-05, "loss": 0.35657715797424316, "step": 348 }, { "epoch": 0.0698, "grad_norm": 11.22412109375, "learning_rate": 1.8983673469387755e-05, "loss": 0.46766042709350586, "step": 349 }, { "epoch": 0.07, "grad_norm": 6.606272220611572, "learning_rate": 1.8979591836734696e-05, "loss": 0.06433617323637009, "step": 350 }, { "epoch": 0.0702, "grad_norm": 16.827150344848633, "learning_rate": 1.8975510204081634e-05, "loss": 0.88922518491745, "step": 351 }, { "epoch": 0.0704, "grad_norm": 13.23413372039795, "learning_rate": 1.8971428571428575e-05, "loss": 0.843842089176178, "step": 352 }, { "epoch": 0.0706, "grad_norm": 8.296690940856934, "learning_rate": 1.8967346938775512e-05, "loss": 0.18257860839366913, "step": 353 }, { "epoch": 0.0708, "grad_norm": 7.721277713775635, "learning_rate": 1.896326530612245e-05, "loss": 0.2024276703596115, "step": 354 }, { "epoch": 0.071, "grad_norm": 19.015888214111328, "learning_rate": 1.895918367346939e-05, "loss": 1.2812749147415161, "step": 355 }, { "epoch": 0.0712, "grad_norm": 29.96820640563965, "learning_rate": 1.895510204081633e-05, "loss": 2.0008366107940674, "step": 356 }, { "epoch": 0.0714, "grad_norm": 12.12214469909668, "learning_rate": 1.8951020408163266e-05, "loss": 0.4490646421909332, "step": 357 }, { "epoch": 0.0716, "grad_norm": 11.902840614318848, "learning_rate": 1.8946938775510207e-05, "loss": 0.34503689408302307, "step": 358 }, { "epoch": 0.0718, "grad_norm": 18.720752716064453, "learning_rate": 1.8942857142857145e-05, "loss": 2.382197618484497, "step": 359 }, { "epoch": 0.072, "grad_norm": 16.927722930908203, "learning_rate": 1.8938775510204083e-05, "loss": 2.2210466861724854, "step": 360 }, { "epoch": 0.0722, "grad_norm": 10.560834884643555, "learning_rate": 1.8934693877551024e-05, "loss": 2.395998954772949, "step": 361 }, { "epoch": 0.0724, "grad_norm": 11.43918228149414, "learning_rate": 1.893061224489796e-05, "loss": 2.432565927505493, "step": 362 }, { "epoch": 0.0726, "grad_norm": 12.733336448669434, "learning_rate": 1.89265306122449e-05, "loss": 2.5630064010620117, "step": 363 }, { "epoch": 0.0728, "grad_norm": 10.23054027557373, "learning_rate": 1.892244897959184e-05, "loss": 2.4672763347625732, "step": 364 }, { "epoch": 0.073, "grad_norm": 13.976507186889648, "learning_rate": 1.8918367346938778e-05, "loss": 1.5226325988769531, "step": 365 }, { "epoch": 0.0732, "grad_norm": 12.58265495300293, "learning_rate": 1.8914285714285715e-05, "loss": 0.5800626873970032, "step": 366 }, { "epoch": 0.0734, "grad_norm": 12.559310913085938, "learning_rate": 1.8910204081632656e-05, "loss": 0.43674007058143616, "step": 367 }, { "epoch": 0.0736, "grad_norm": 13.056805610656738, "learning_rate": 1.8906122448979594e-05, "loss": 0.6725685000419617, "step": 368 }, { "epoch": 0.0738, "grad_norm": 12.201833724975586, "learning_rate": 1.890204081632653e-05, "loss": 0.5566837787628174, "step": 369 }, { "epoch": 0.074, "grad_norm": 10.063617706298828, "learning_rate": 1.8897959183673473e-05, "loss": 0.7213773131370544, "step": 370 }, { "epoch": 0.0742, "grad_norm": 10.36479663848877, "learning_rate": 1.889387755102041e-05, "loss": 0.5325279831886292, "step": 371 }, { "epoch": 0.0744, "grad_norm": 8.085536003112793, "learning_rate": 1.8889795918367348e-05, "loss": 0.3209850490093231, "step": 372 }, { "epoch": 0.0746, "grad_norm": 17.15733528137207, "learning_rate": 1.888571428571429e-05, "loss": 1.9678653478622437, "step": 373 }, { "epoch": 0.0748, "grad_norm": 14.497695922851562, "learning_rate": 1.8881632653061226e-05, "loss": 1.6297591924667358, "step": 374 }, { "epoch": 0.075, "grad_norm": 16.877817153930664, "learning_rate": 1.8877551020408164e-05, "loss": 1.9466644525527954, "step": 375 }, { "epoch": 0.0752, "grad_norm": 13.411498069763184, "learning_rate": 1.8873469387755102e-05, "loss": 1.7723740339279175, "step": 376 }, { "epoch": 0.0754, "grad_norm": 14.190561294555664, "learning_rate": 1.8869387755102043e-05, "loss": 0.6084811091423035, "step": 377 }, { "epoch": 0.0756, "grad_norm": 14.289806365966797, "learning_rate": 1.886530612244898e-05, "loss": 0.6782891154289246, "step": 378 }, { "epoch": 0.0758, "grad_norm": 15.375550270080566, "learning_rate": 1.8861224489795918e-05, "loss": 1.4179891347885132, "step": 379 }, { "epoch": 0.076, "grad_norm": 9.811424255371094, "learning_rate": 1.885714285714286e-05, "loss": 0.7442667484283447, "step": 380 }, { "epoch": 0.0762, "grad_norm": 13.01511287689209, "learning_rate": 1.8853061224489797e-05, "loss": 1.3766447305679321, "step": 381 }, { "epoch": 0.0764, "grad_norm": 12.5484037399292, "learning_rate": 1.8848979591836734e-05, "loss": 1.2930976152420044, "step": 382 }, { "epoch": 0.0766, "grad_norm": 11.38595962524414, "learning_rate": 1.8844897959183675e-05, "loss": 1.223589301109314, "step": 383 }, { "epoch": 0.0768, "grad_norm": 10.15343189239502, "learning_rate": 1.8840816326530613e-05, "loss": 1.350658893585205, "step": 384 }, { "epoch": 0.077, "grad_norm": 11.184205055236816, "learning_rate": 1.883673469387755e-05, "loss": 1.2625024318695068, "step": 385 }, { "epoch": 0.0772, "grad_norm": 9.304605484008789, "learning_rate": 1.883265306122449e-05, "loss": 1.0793894529342651, "step": 386 }, { "epoch": 0.0774, "grad_norm": 9.564764022827148, "learning_rate": 1.882857142857143e-05, "loss": 0.29348039627075195, "step": 387 }, { "epoch": 0.0776, "grad_norm": 4.9206318855285645, "learning_rate": 1.8824489795918367e-05, "loss": 0.06144172325730324, "step": 388 }, { "epoch": 0.0778, "grad_norm": 14.901082992553711, "learning_rate": 1.8820408163265308e-05, "loss": 2.588120222091675, "step": 389 }, { "epoch": 0.078, "grad_norm": 10.4075288772583, "learning_rate": 1.8816326530612246e-05, "loss": 2.262578248977661, "step": 390 }, { "epoch": 0.0782, "grad_norm": 13.037023544311523, "learning_rate": 1.8812244897959183e-05, "loss": 0.175909161567688, "step": 391 }, { "epoch": 0.0784, "grad_norm": 3.5371270179748535, "learning_rate": 1.8808163265306124e-05, "loss": 0.25394168496131897, "step": 392 }, { "epoch": 0.0786, "grad_norm": 11.433444023132324, "learning_rate": 1.8804081632653062e-05, "loss": 0.4397641122341156, "step": 393 }, { "epoch": 0.0788, "grad_norm": 6.777037143707275, "learning_rate": 1.88e-05, "loss": 0.052677273750305176, "step": 394 }, { "epoch": 0.079, "grad_norm": 9.813986778259277, "learning_rate": 1.879591836734694e-05, "loss": 0.242705300450325, "step": 395 }, { "epoch": 0.0792, "grad_norm": 5.816673278808594, "learning_rate": 1.879183673469388e-05, "loss": 0.08530381321907043, "step": 396 }, { "epoch": 0.0794, "grad_norm": 11.603211402893066, "learning_rate": 1.878775510204082e-05, "loss": 0.48026981949806213, "step": 397 }, { "epoch": 0.0796, "grad_norm": 4.812192440032959, "learning_rate": 1.8783673469387757e-05, "loss": 0.05827462673187256, "step": 398 }, { "epoch": 0.0798, "grad_norm": 13.11069107055664, "learning_rate": 1.8779591836734698e-05, "loss": 1.2811168432235718, "step": 399 }, { "epoch": 0.08, "grad_norm": 10.038864135742188, "learning_rate": 1.8775510204081636e-05, "loss": 0.673806369304657, "step": 400 }, { "epoch": 0.0802, "grad_norm": 11.571066856384277, "learning_rate": 1.8771428571428573e-05, "loss": 0.5589026808738708, "step": 401 }, { "epoch": 0.0804, "grad_norm": 9.705341339111328, "learning_rate": 1.876734693877551e-05, "loss": 0.24957221746444702, "step": 402 }, { "epoch": 0.0806, "grad_norm": 13.06927490234375, "learning_rate": 1.8763265306122452e-05, "loss": 1.4396060705184937, "step": 403 }, { "epoch": 0.0808, "grad_norm": 17.341108322143555, "learning_rate": 1.875918367346939e-05, "loss": 0.6699572205543518, "step": 404 }, { "epoch": 0.081, "grad_norm": 10.573736190795898, "learning_rate": 1.8755102040816327e-05, "loss": 1.288001298904419, "step": 405 }, { "epoch": 0.0812, "grad_norm": 12.364386558532715, "learning_rate": 1.8751020408163268e-05, "loss": 0.603476881980896, "step": 406 }, { "epoch": 0.0814, "grad_norm": 13.087056159973145, "learning_rate": 1.8746938775510206e-05, "loss": 0.4683491289615631, "step": 407 }, { "epoch": 0.0816, "grad_norm": 9.821493148803711, "learning_rate": 1.8742857142857143e-05, "loss": 0.3452199399471283, "step": 408 }, { "epoch": 0.0818, "grad_norm": 10.179647445678711, "learning_rate": 1.8738775510204084e-05, "loss": 0.6895151138305664, "step": 409 }, { "epoch": 0.082, "grad_norm": 12.020050048828125, "learning_rate": 1.8734693877551022e-05, "loss": 0.35574761033058167, "step": 410 }, { "epoch": 0.0822, "grad_norm": 14.25110149383545, "learning_rate": 1.873061224489796e-05, "loss": 0.3857842683792114, "step": 411 }, { "epoch": 0.0824, "grad_norm": 8.994257926940918, "learning_rate": 1.87265306122449e-05, "loss": 0.4511173665523529, "step": 412 }, { "epoch": 0.0826, "grad_norm": 18.089815139770508, "learning_rate": 1.872244897959184e-05, "loss": 0.9559245109558105, "step": 413 }, { "epoch": 0.0828, "grad_norm": 17.2407169342041, "learning_rate": 1.8718367346938776e-05, "loss": 0.8183043599128723, "step": 414 }, { "epoch": 0.083, "grad_norm": 14.020536422729492, "learning_rate": 1.8714285714285717e-05, "loss": 0.5215857028961182, "step": 415 }, { "epoch": 0.0832, "grad_norm": 12.235998153686523, "learning_rate": 1.8710204081632655e-05, "loss": 0.47453007102012634, "step": 416 }, { "epoch": 0.0834, "grad_norm": 10.737859725952148, "learning_rate": 1.8706122448979592e-05, "loss": 0.3702681362628937, "step": 417 }, { "epoch": 0.0836, "grad_norm": 12.272703170776367, "learning_rate": 1.8702040816326533e-05, "loss": 0.4834024906158447, "step": 418 }, { "epoch": 0.0838, "grad_norm": 10.39461612701416, "learning_rate": 1.869795918367347e-05, "loss": 0.5086528062820435, "step": 419 }, { "epoch": 0.084, "grad_norm": 9.570913314819336, "learning_rate": 1.869387755102041e-05, "loss": 0.49558886885643005, "step": 420 }, { "epoch": 0.0842, "grad_norm": 12.726007461547852, "learning_rate": 1.868979591836735e-05, "loss": 0.38814687728881836, "step": 421 }, { "epoch": 0.0844, "grad_norm": 10.535909652709961, "learning_rate": 1.8685714285714287e-05, "loss": 0.5980618596076965, "step": 422 }, { "epoch": 0.0846, "grad_norm": 12.708730697631836, "learning_rate": 1.8681632653061225e-05, "loss": 1.2424190044403076, "step": 423 }, { "epoch": 0.0848, "grad_norm": 8.670951843261719, "learning_rate": 1.8677551020408166e-05, "loss": 0.856053352355957, "step": 424 }, { "epoch": 0.085, "grad_norm": 7.738373279571533, "learning_rate": 1.8673469387755104e-05, "loss": 0.2559834420681, "step": 425 }, { "epoch": 0.0852, "grad_norm": 12.78547477722168, "learning_rate": 1.866938775510204e-05, "loss": 0.24063949286937714, "step": 426 }, { "epoch": 0.0854, "grad_norm": 10.760578155517578, "learning_rate": 1.8665306122448982e-05, "loss": 0.21334774792194366, "step": 427 }, { "epoch": 0.0856, "grad_norm": 8.15738582611084, "learning_rate": 1.866122448979592e-05, "loss": 0.12981675565242767, "step": 428 }, { "epoch": 0.0858, "grad_norm": 13.824641227722168, "learning_rate": 1.8657142857142858e-05, "loss": 0.8066617846488953, "step": 429 }, { "epoch": 0.086, "grad_norm": 10.609761238098145, "learning_rate": 1.8653061224489795e-05, "loss": 0.8554279208183289, "step": 430 }, { "epoch": 0.0862, "grad_norm": 14.266653060913086, "learning_rate": 1.8648979591836736e-05, "loss": 0.6035276055335999, "step": 431 }, { "epoch": 0.0864, "grad_norm": 12.586504936218262, "learning_rate": 1.8644897959183674e-05, "loss": 0.8313179016113281, "step": 432 }, { "epoch": 0.0866, "grad_norm": 10.952289581298828, "learning_rate": 1.864081632653061e-05, "loss": 0.7162933349609375, "step": 433 }, { "epoch": 0.0868, "grad_norm": 10.059403419494629, "learning_rate": 1.8636734693877553e-05, "loss": 0.2089420109987259, "step": 434 }, { "epoch": 0.087, "grad_norm": 12.50546932220459, "learning_rate": 1.863265306122449e-05, "loss": 0.8918299078941345, "step": 435 }, { "epoch": 0.0872, "grad_norm": 8.853002548217773, "learning_rate": 1.8628571428571428e-05, "loss": 0.1686689257621765, "step": 436 }, { "epoch": 0.0874, "grad_norm": 11.31886100769043, "learning_rate": 1.862448979591837e-05, "loss": 1.5455905199050903, "step": 437 }, { "epoch": 0.0876, "grad_norm": 19.86179542541504, "learning_rate": 1.8620408163265307e-05, "loss": 0.8918477892875671, "step": 438 }, { "epoch": 0.0878, "grad_norm": 40.09619903564453, "learning_rate": 1.8616326530612244e-05, "loss": 2.142383337020874, "step": 439 }, { "epoch": 0.088, "grad_norm": 38.148292541503906, "learning_rate": 1.8612244897959185e-05, "loss": 2.0496737957000732, "step": 440 }, { "epoch": 0.0882, "grad_norm": 14.241689682006836, "learning_rate": 1.8608163265306126e-05, "loss": 0.4535129964351654, "step": 441 }, { "epoch": 0.0884, "grad_norm": 9.968354225158691, "learning_rate": 1.8604081632653064e-05, "loss": 0.1674795150756836, "step": 442 }, { "epoch": 0.0886, "grad_norm": 15.795212745666504, "learning_rate": 1.86e-05, "loss": 0.9672284126281738, "step": 443 }, { "epoch": 0.0888, "grad_norm": 10.975865364074707, "learning_rate": 1.8595918367346943e-05, "loss": 1.0675843954086304, "step": 444 }, { "epoch": 0.089, "grad_norm": 11.84468936920166, "learning_rate": 1.859183673469388e-05, "loss": 0.47408929467201233, "step": 445 }, { "epoch": 0.0892, "grad_norm": 10.550907135009766, "learning_rate": 1.8587755102040818e-05, "loss": 0.29741260409355164, "step": 446 }, { "epoch": 0.0894, "grad_norm": 12.465574264526367, "learning_rate": 1.858367346938776e-05, "loss": 1.206751823425293, "step": 447 }, { "epoch": 0.0896, "grad_norm": 13.958642959594727, "learning_rate": 1.8579591836734696e-05, "loss": 0.9328053593635559, "step": 448 }, { "epoch": 0.0898, "grad_norm": 18.270723342895508, "learning_rate": 1.8575510204081634e-05, "loss": 0.27732086181640625, "step": 449 }, { "epoch": 0.09, "grad_norm": 15.656776428222656, "learning_rate": 1.8571428571428575e-05, "loss": 0.2517017722129822, "step": 450 }, { "epoch": 0.0902, "grad_norm": 16.57853126525879, "learning_rate": 1.8567346938775513e-05, "loss": 0.6180245280265808, "step": 451 }, { "epoch": 0.0904, "grad_norm": 19.03170394897461, "learning_rate": 1.856326530612245e-05, "loss": 0.49093642830848694, "step": 452 }, { "epoch": 0.0906, "grad_norm": 12.310887336730957, "learning_rate": 1.855918367346939e-05, "loss": 0.3317486643791199, "step": 453 }, { "epoch": 0.0908, "grad_norm": 11.586458206176758, "learning_rate": 1.855510204081633e-05, "loss": 0.35807475447654724, "step": 454 }, { "epoch": 0.091, "grad_norm": 10.65620231628418, "learning_rate": 1.8551020408163267e-05, "loss": 0.36819103360176086, "step": 455 }, { "epoch": 0.0912, "grad_norm": 10.369810104370117, "learning_rate": 1.8546938775510204e-05, "loss": 0.30374273657798767, "step": 456 }, { "epoch": 0.0914, "grad_norm": 12.69101333618164, "learning_rate": 1.8542857142857145e-05, "loss": 1.052322268486023, "step": 457 }, { "epoch": 0.0916, "grad_norm": 8.62214469909668, "learning_rate": 1.8538775510204083e-05, "loss": 0.3165879547595978, "step": 458 }, { "epoch": 0.0918, "grad_norm": 13.178669929504395, "learning_rate": 1.853469387755102e-05, "loss": 0.9034677147865295, "step": 459 }, { "epoch": 0.092, "grad_norm": 8.487058639526367, "learning_rate": 1.853061224489796e-05, "loss": 0.5634719133377075, "step": 460 }, { "epoch": 0.0922, "grad_norm": 10.703817367553711, "learning_rate": 1.85265306122449e-05, "loss": 0.4245595932006836, "step": 461 }, { "epoch": 0.0924, "grad_norm": 5.564105987548828, "learning_rate": 1.8522448979591837e-05, "loss": 0.05999644473195076, "step": 462 }, { "epoch": 0.0926, "grad_norm": 5.57875919342041, "learning_rate": 1.8518367346938778e-05, "loss": 0.19407449662685394, "step": 463 }, { "epoch": 0.0928, "grad_norm": 5.318073272705078, "learning_rate": 1.8514285714285716e-05, "loss": 0.1788109540939331, "step": 464 }, { "epoch": 0.093, "grad_norm": 15.313523292541504, "learning_rate": 1.8510204081632653e-05, "loss": 0.68692547082901, "step": 465 }, { "epoch": 0.0932, "grad_norm": 11.062304496765137, "learning_rate": 1.8506122448979594e-05, "loss": 0.8494117856025696, "step": 466 }, { "epoch": 0.0934, "grad_norm": 16.3399658203125, "learning_rate": 1.8502040816326532e-05, "loss": 0.786237895488739, "step": 467 }, { "epoch": 0.0936, "grad_norm": 10.19522476196289, "learning_rate": 1.849795918367347e-05, "loss": 0.9438123106956482, "step": 468 }, { "epoch": 0.0938, "grad_norm": 12.138736724853516, "learning_rate": 1.849387755102041e-05, "loss": 0.4503382742404938, "step": 469 }, { "epoch": 0.094, "grad_norm": 10.992502212524414, "learning_rate": 1.8489795918367348e-05, "loss": 0.6347247958183289, "step": 470 }, { "epoch": 0.0942, "grad_norm": 12.836503028869629, "learning_rate": 1.8485714285714286e-05, "loss": 1.1122783422470093, "step": 471 }, { "epoch": 0.0944, "grad_norm": 14.83193302154541, "learning_rate": 1.8481632653061227e-05, "loss": 1.1462980508804321, "step": 472 }, { "epoch": 0.0946, "grad_norm": 11.722692489624023, "learning_rate": 1.8477551020408165e-05, "loss": 0.5339711308479309, "step": 473 }, { "epoch": 0.0948, "grad_norm": 9.716527938842773, "learning_rate": 1.8473469387755102e-05, "loss": 0.5523082613945007, "step": 474 }, { "epoch": 0.095, "grad_norm": 11.501835823059082, "learning_rate": 1.8469387755102043e-05, "loss": 0.6342002749443054, "step": 475 }, { "epoch": 0.0952, "grad_norm": 7.19667911529541, "learning_rate": 1.846530612244898e-05, "loss": 0.17063356935977936, "step": 476 }, { "epoch": 0.0954, "grad_norm": 11.882790565490723, "learning_rate": 1.846122448979592e-05, "loss": 0.7495070099830627, "step": 477 }, { "epoch": 0.0956, "grad_norm": 7.023852348327637, "learning_rate": 1.845714285714286e-05, "loss": 0.2240893840789795, "step": 478 }, { "epoch": 0.0958, "grad_norm": 13.528419494628906, "learning_rate": 1.8453061224489797e-05, "loss": 1.1490691900253296, "step": 479 }, { "epoch": 0.096, "grad_norm": 14.662690162658691, "learning_rate": 1.8448979591836735e-05, "loss": 1.295188307762146, "step": 480 }, { "epoch": 0.0962, "grad_norm": 10.792596817016602, "learning_rate": 1.8444897959183672e-05, "loss": 0.4174814224243164, "step": 481 }, { "epoch": 0.0964, "grad_norm": 6.941728115081787, "learning_rate": 1.8440816326530613e-05, "loss": 0.15877938270568848, "step": 482 }, { "epoch": 0.0966, "grad_norm": 19.152055740356445, "learning_rate": 1.843673469387755e-05, "loss": 1.4050027132034302, "step": 483 }, { "epoch": 0.0968, "grad_norm": 11.851791381835938, "learning_rate": 1.843265306122449e-05, "loss": 0.5855047106742859, "step": 484 }, { "epoch": 0.097, "grad_norm": 15.214301109313965, "learning_rate": 1.842857142857143e-05, "loss": 0.3991682529449463, "step": 485 }, { "epoch": 0.0972, "grad_norm": 11.383341789245605, "learning_rate": 1.842448979591837e-05, "loss": 0.36300501227378845, "step": 486 }, { "epoch": 0.0974, "grad_norm": 8.519134521484375, "learning_rate": 1.842040816326531e-05, "loss": 0.5041393637657166, "step": 487 }, { "epoch": 0.0976, "grad_norm": 10.494160652160645, "learning_rate": 1.8416326530612246e-05, "loss": 0.8635578751564026, "step": 488 }, { "epoch": 0.0978, "grad_norm": 11.271151542663574, "learning_rate": 1.8412244897959187e-05, "loss": 0.6950626969337463, "step": 489 }, { "epoch": 0.098, "grad_norm": 6.710206031799316, "learning_rate": 1.8408163265306125e-05, "loss": 0.4448658525943756, "step": 490 }, { "epoch": 0.0982, "grad_norm": 11.673338890075684, "learning_rate": 1.8404081632653062e-05, "loss": 0.8156289458274841, "step": 491 }, { "epoch": 0.0984, "grad_norm": 7.988609313964844, "learning_rate": 1.8400000000000003e-05, "loss": 0.5245321989059448, "step": 492 }, { "epoch": 0.0986, "grad_norm": 10.861720085144043, "learning_rate": 1.839591836734694e-05, "loss": 0.42232275009155273, "step": 493 }, { "epoch": 0.0988, "grad_norm": 7.797056198120117, "learning_rate": 1.839183673469388e-05, "loss": 0.0740867406129837, "step": 494 }, { "epoch": 0.099, "grad_norm": 39.92266845703125, "learning_rate": 1.838775510204082e-05, "loss": 2.273386240005493, "step": 495 }, { "epoch": 0.0992, "grad_norm": 33.89152908325195, "learning_rate": 1.8383673469387757e-05, "loss": 1.6251716613769531, "step": 496 }, { "epoch": 0.0994, "grad_norm": 13.701949119567871, "learning_rate": 1.8379591836734695e-05, "loss": 0.48105981945991516, "step": 497 }, { "epoch": 0.0996, "grad_norm": 14.770544052124023, "learning_rate": 1.8375510204081636e-05, "loss": 0.45467638969421387, "step": 498 }, { "epoch": 0.0998, "grad_norm": 10.267172813415527, "learning_rate": 1.8371428571428574e-05, "loss": 0.27381423115730286, "step": 499 }, { "epoch": 0.1, "grad_norm": 12.922121047973633, "learning_rate": 1.836734693877551e-05, "loss": 0.3002593517303467, "step": 500 }, { "epoch": 0.1002, "grad_norm": 10.551634788513184, "learning_rate": 1.8363265306122452e-05, "loss": 0.22908790409564972, "step": 501 }, { "epoch": 0.1004, "grad_norm": 10.607389450073242, "learning_rate": 1.835918367346939e-05, "loss": 0.24098807573318481, "step": 502 }, { "epoch": 0.1006, "grad_norm": 8.937371253967285, "learning_rate": 1.8355102040816328e-05, "loss": 0.24399320781230927, "step": 503 }, { "epoch": 0.1008, "grad_norm": 8.051960945129395, "learning_rate": 1.835102040816327e-05, "loss": 0.28692182898521423, "step": 504 }, { "epoch": 0.101, "grad_norm": 14.802160263061523, "learning_rate": 1.8346938775510206e-05, "loss": 0.3659327030181885, "step": 505 }, { "epoch": 0.1012, "grad_norm": 9.041366577148438, "learning_rate": 1.8342857142857144e-05, "loss": 0.16635489463806152, "step": 506 }, { "epoch": 0.1014, "grad_norm": 9.571503639221191, "learning_rate": 1.833877551020408e-05, "loss": 0.6271096467971802, "step": 507 }, { "epoch": 0.1016, "grad_norm": 12.329565048217773, "learning_rate": 1.8334693877551023e-05, "loss": 0.8489407896995544, "step": 508 }, { "epoch": 0.1018, "grad_norm": 9.421364784240723, "learning_rate": 1.833061224489796e-05, "loss": 0.24840576946735382, "step": 509 }, { "epoch": 0.102, "grad_norm": 7.825616836547852, "learning_rate": 1.8326530612244898e-05, "loss": 0.12121107429265976, "step": 510 }, { "epoch": 0.1022, "grad_norm": 9.709155082702637, "learning_rate": 1.832244897959184e-05, "loss": 0.5488207340240479, "step": 511 }, { "epoch": 0.1024, "grad_norm": 9.480342864990234, "learning_rate": 1.8318367346938777e-05, "loss": 0.4018261730670929, "step": 512 }, { "epoch": 0.1026, "grad_norm": 8.605011940002441, "learning_rate": 1.8314285714285714e-05, "loss": 0.3484914302825928, "step": 513 }, { "epoch": 0.1028, "grad_norm": 12.490439414978027, "learning_rate": 1.8310204081632655e-05, "loss": 0.28063103556632996, "step": 514 }, { "epoch": 0.103, "grad_norm": 9.80734920501709, "learning_rate": 1.8306122448979593e-05, "loss": 0.3822651207447052, "step": 515 }, { "epoch": 0.1032, "grad_norm": 4.705074787139893, "learning_rate": 1.830204081632653e-05, "loss": 0.05691803619265556, "step": 516 }, { "epoch": 0.1034, "grad_norm": 20.423694610595703, "learning_rate": 1.829795918367347e-05, "loss": 0.4564323425292969, "step": 517 }, { "epoch": 0.1036, "grad_norm": 8.043682098388672, "learning_rate": 1.829387755102041e-05, "loss": 0.18622785806655884, "step": 518 }, { "epoch": 0.1038, "grad_norm": 11.38018798828125, "learning_rate": 1.8289795918367347e-05, "loss": 0.5270686745643616, "step": 519 }, { "epoch": 0.104, "grad_norm": 7.291547775268555, "learning_rate": 1.8285714285714288e-05, "loss": 0.1516893357038498, "step": 520 }, { "epoch": 0.1042, "grad_norm": 12.855888366699219, "learning_rate": 1.8281632653061225e-05, "loss": 0.6389946341514587, "step": 521 }, { "epoch": 0.1044, "grad_norm": 9.985026359558105, "learning_rate": 1.8277551020408163e-05, "loss": 0.7234979271888733, "step": 522 }, { "epoch": 0.1046, "grad_norm": 12.087357521057129, "learning_rate": 1.8273469387755104e-05, "loss": 0.7861669659614563, "step": 523 }, { "epoch": 0.1048, "grad_norm": 15.694661140441895, "learning_rate": 1.8269387755102042e-05, "loss": 0.5220512747764587, "step": 524 }, { "epoch": 0.105, "grad_norm": 12.429661750793457, "learning_rate": 1.826530612244898e-05, "loss": 0.6339209675788879, "step": 525 }, { "epoch": 0.1052, "grad_norm": 13.236414909362793, "learning_rate": 1.826122448979592e-05, "loss": 0.4826647937297821, "step": 526 }, { "epoch": 0.1054, "grad_norm": 10.18588924407959, "learning_rate": 1.8257142857142858e-05, "loss": 0.40340495109558105, "step": 527 }, { "epoch": 0.1056, "grad_norm": 9.78604793548584, "learning_rate": 1.8253061224489796e-05, "loss": 0.37776312232017517, "step": 528 }, { "epoch": 0.1058, "grad_norm": 10.337906837463379, "learning_rate": 1.8248979591836737e-05, "loss": 0.32520875334739685, "step": 529 }, { "epoch": 0.106, "grad_norm": 5.0009965896606445, "learning_rate": 1.8244897959183674e-05, "loss": 0.1371188759803772, "step": 530 }, { "epoch": 0.1062, "grad_norm": 10.34693431854248, "learning_rate": 1.8240816326530612e-05, "loss": 0.3658228814601898, "step": 531 }, { "epoch": 0.1064, "grad_norm": 12.065381050109863, "learning_rate": 1.8236734693877553e-05, "loss": 0.49137282371520996, "step": 532 }, { "epoch": 0.1066, "grad_norm": 12.316068649291992, "learning_rate": 1.823265306122449e-05, "loss": 0.39971527457237244, "step": 533 }, { "epoch": 0.1068, "grad_norm": 9.137246131896973, "learning_rate": 1.822857142857143e-05, "loss": 0.36186349391937256, "step": 534 }, { "epoch": 0.107, "grad_norm": 7.118947505950928, "learning_rate": 1.822448979591837e-05, "loss": 0.2133263200521469, "step": 535 }, { "epoch": 0.1072, "grad_norm": 6.141784191131592, "learning_rate": 1.8220408163265307e-05, "loss": 0.12082049995660782, "step": 536 }, { "epoch": 0.1074, "grad_norm": 12.222509384155273, "learning_rate": 1.8216326530612248e-05, "loss": 0.44458451867103577, "step": 537 }, { "epoch": 0.1076, "grad_norm": 14.006424903869629, "learning_rate": 1.8212244897959186e-05, "loss": 0.3920406401157379, "step": 538 }, { "epoch": 0.1078, "grad_norm": 13.02974796295166, "learning_rate": 1.8208163265306123e-05, "loss": 0.3564494848251343, "step": 539 }, { "epoch": 0.108, "grad_norm": 18.711116790771484, "learning_rate": 1.8204081632653064e-05, "loss": 0.344758003950119, "step": 540 }, { "epoch": 0.1082, "grad_norm": 20.329181671142578, "learning_rate": 1.8200000000000002e-05, "loss": 0.9119019508361816, "step": 541 }, { "epoch": 0.1084, "grad_norm": 19.114761352539062, "learning_rate": 1.819591836734694e-05, "loss": 0.7259604334831238, "step": 542 }, { "epoch": 0.1086, "grad_norm": 19.6895809173584, "learning_rate": 1.819183673469388e-05, "loss": 1.1039327383041382, "step": 543 }, { "epoch": 0.1088, "grad_norm": 14.621075630187988, "learning_rate": 1.8187755102040818e-05, "loss": 0.6637970805168152, "step": 544 }, { "epoch": 0.109, "grad_norm": 31.219099044799805, "learning_rate": 1.8183673469387756e-05, "loss": 0.852077305316925, "step": 545 }, { "epoch": 0.1092, "grad_norm": 26.689311981201172, "learning_rate": 1.8179591836734697e-05, "loss": 0.778461754322052, "step": 546 }, { "epoch": 0.1094, "grad_norm": 10.68387222290039, "learning_rate": 1.8175510204081635e-05, "loss": 0.7177695631980896, "step": 547 }, { "epoch": 0.1096, "grad_norm": 9.821282386779785, "learning_rate": 1.8171428571428572e-05, "loss": 0.755763828754425, "step": 548 }, { "epoch": 0.1098, "grad_norm": 14.495914459228516, "learning_rate": 1.8167346938775513e-05, "loss": 1.236759901046753, "step": 549 }, { "epoch": 0.11, "grad_norm": 8.800137519836426, "learning_rate": 1.816326530612245e-05, "loss": 1.1966830492019653, "step": 550 }, { "epoch": 0.1102, "grad_norm": 9.097552299499512, "learning_rate": 1.815918367346939e-05, "loss": 0.3670867383480072, "step": 551 }, { "epoch": 0.1104, "grad_norm": 4.9611430168151855, "learning_rate": 1.815510204081633e-05, "loss": 0.0949157252907753, "step": 552 }, { "epoch": 0.1106, "grad_norm": 13.15954303741455, "learning_rate": 1.8151020408163267e-05, "loss": 2.18172025680542, "step": 553 }, { "epoch": 0.1108, "grad_norm": 15.633612632751465, "learning_rate": 1.8146938775510205e-05, "loss": 2.1954867839813232, "step": 554 }, { "epoch": 0.111, "grad_norm": 21.19856834411621, "learning_rate": 1.8142857142857146e-05, "loss": 1.004319190979004, "step": 555 }, { "epoch": 0.1112, "grad_norm": 16.217754364013672, "learning_rate": 1.8138775510204083e-05, "loss": 0.8675961494445801, "step": 556 }, { "epoch": 0.1114, "grad_norm": 11.018362998962402, "learning_rate": 1.813469387755102e-05, "loss": 0.3288007080554962, "step": 557 }, { "epoch": 0.1116, "grad_norm": 12.571540832519531, "learning_rate": 1.8130612244897962e-05, "loss": 0.23565353453159332, "step": 558 }, { "epoch": 0.1118, "grad_norm": 13.624663352966309, "learning_rate": 1.81265306122449e-05, "loss": 0.33491167426109314, "step": 559 }, { "epoch": 0.112, "grad_norm": 12.053435325622559, "learning_rate": 1.8122448979591837e-05, "loss": 0.2845800817012787, "step": 560 }, { "epoch": 0.1122, "grad_norm": 12.036954879760742, "learning_rate": 1.8118367346938775e-05, "loss": 1.890631079673767, "step": 561 }, { "epoch": 0.1124, "grad_norm": 12.141047477722168, "learning_rate": 1.8114285714285716e-05, "loss": 1.7847900390625, "step": 562 }, { "epoch": 0.1126, "grad_norm": 14.690183639526367, "learning_rate": 1.8110204081632654e-05, "loss": 1.274082064628601, "step": 563 }, { "epoch": 0.1128, "grad_norm": 14.72066879272461, "learning_rate": 1.810612244897959e-05, "loss": 1.1627548933029175, "step": 564 }, { "epoch": 0.113, "grad_norm": 14.321584701538086, "learning_rate": 1.8102040816326532e-05, "loss": 2.852611541748047, "step": 565 }, { "epoch": 0.1132, "grad_norm": 11.83179759979248, "learning_rate": 1.809795918367347e-05, "loss": 2.830625295639038, "step": 566 }, { "epoch": 0.1134, "grad_norm": 12.19902515411377, "learning_rate": 1.8093877551020408e-05, "loss": 0.9268226623535156, "step": 567 }, { "epoch": 0.1136, "grad_norm": 6.607133865356445, "learning_rate": 1.808979591836735e-05, "loss": 0.3435347378253937, "step": 568 }, { "epoch": 0.1138, "grad_norm": 10.809368133544922, "learning_rate": 1.8085714285714286e-05, "loss": 0.43306198716163635, "step": 569 }, { "epoch": 0.114, "grad_norm": 10.219008445739746, "learning_rate": 1.8081632653061224e-05, "loss": 0.3536444902420044, "step": 570 }, { "epoch": 0.1142, "grad_norm": 7.8596367835998535, "learning_rate": 1.8077551020408165e-05, "loss": 2.3697917461395264, "step": 571 }, { "epoch": 0.1144, "grad_norm": 6.792211532592773, "learning_rate": 1.8073469387755103e-05, "loss": 2.2773189544677734, "step": 572 }, { "epoch": 0.1146, "grad_norm": 10.349200248718262, "learning_rate": 1.806938775510204e-05, "loss": 0.4243791997432709, "step": 573 }, { "epoch": 0.1148, "grad_norm": 3.6428349018096924, "learning_rate": 1.806530612244898e-05, "loss": 0.039050281047821045, "step": 574 }, { "epoch": 0.115, "grad_norm": 9.740432739257812, "learning_rate": 1.806122448979592e-05, "loss": 0.378023624420166, "step": 575 }, { "epoch": 0.1152, "grad_norm": 6.016699314117432, "learning_rate": 1.8057142857142857e-05, "loss": 0.06636029481887817, "step": 576 }, { "epoch": 0.1154, "grad_norm": 11.535165786743164, "learning_rate": 1.8053061224489798e-05, "loss": 1.3998335599899292, "step": 577 }, { "epoch": 0.1156, "grad_norm": 8.401366233825684, "learning_rate": 1.804897959183674e-05, "loss": 0.6503568291664124, "step": 578 }, { "epoch": 0.1158, "grad_norm": 18.115427017211914, "learning_rate": 1.8044897959183676e-05, "loss": 3.949636459350586, "step": 579 }, { "epoch": 0.116, "grad_norm": 15.45248794555664, "learning_rate": 1.8040816326530614e-05, "loss": 3.7633888721466064, "step": 580 }, { "epoch": 0.1162, "grad_norm": 20.35097312927246, "learning_rate": 1.8036734693877555e-05, "loss": 0.8000307679176331, "step": 581 }, { "epoch": 0.1164, "grad_norm": 21.418190002441406, "learning_rate": 1.8032653061224493e-05, "loss": 0.7529289722442627, "step": 582 }, { "epoch": 0.1166, "grad_norm": 13.8251953125, "learning_rate": 1.802857142857143e-05, "loss": 1.4717602729797363, "step": 583 }, { "epoch": 0.1168, "grad_norm": 15.150423049926758, "learning_rate": 1.802448979591837e-05, "loss": 0.936460018157959, "step": 584 }, { "epoch": 0.117, "grad_norm": 10.854708671569824, "learning_rate": 1.802040816326531e-05, "loss": 0.5200664401054382, "step": 585 }, { "epoch": 0.1172, "grad_norm": 9.888029098510742, "learning_rate": 1.8016326530612247e-05, "loss": 0.33132681250572205, "step": 586 }, { "epoch": 0.1174, "grad_norm": 11.01172924041748, "learning_rate": 1.8012244897959184e-05, "loss": 0.4022061824798584, "step": 587 }, { "epoch": 0.1176, "grad_norm": 5.216026782989502, "learning_rate": 1.8008163265306125e-05, "loss": 0.3725966513156891, "step": 588 }, { "epoch": 0.1178, "grad_norm": 11.804418563842773, "learning_rate": 1.8004081632653063e-05, "loss": 0.5011307597160339, "step": 589 }, { "epoch": 0.118, "grad_norm": 8.733563423156738, "learning_rate": 1.8e-05, "loss": 0.2053084820508957, "step": 590 }, { "epoch": 0.1182, "grad_norm": 18.12481117248535, "learning_rate": 1.799591836734694e-05, "loss": 0.7179663181304932, "step": 591 }, { "epoch": 0.1184, "grad_norm": 17.2807559967041, "learning_rate": 1.799183673469388e-05, "loss": 0.5297542214393616, "step": 592 }, { "epoch": 0.1186, "grad_norm": 38.80964279174805, "learning_rate": 1.7987755102040817e-05, "loss": 2.674657106399536, "step": 593 }, { "epoch": 0.1188, "grad_norm": 38.70146942138672, "learning_rate": 1.7983673469387758e-05, "loss": 2.700640916824341, "step": 594 }, { "epoch": 0.119, "grad_norm": 9.609235763549805, "learning_rate": 1.7979591836734695e-05, "loss": 0.6304271817207336, "step": 595 }, { "epoch": 0.1192, "grad_norm": 8.441871643066406, "learning_rate": 1.7975510204081633e-05, "loss": 0.40333864092826843, "step": 596 }, { "epoch": 0.1194, "grad_norm": 10.5399169921875, "learning_rate": 1.7971428571428574e-05, "loss": 1.5478981733322144, "step": 597 }, { "epoch": 0.1196, "grad_norm": 11.575724601745605, "learning_rate": 1.7967346938775512e-05, "loss": 1.4419304132461548, "step": 598 }, { "epoch": 0.1198, "grad_norm": 9.683016777038574, "learning_rate": 1.796326530612245e-05, "loss": 1.5211687088012695, "step": 599 }, { "epoch": 0.12, "grad_norm": 8.750804901123047, "learning_rate": 1.795918367346939e-05, "loss": 1.2413440942764282, "step": 600 }, { "epoch": 0.1202, "grad_norm": 9.264275550842285, "learning_rate": 1.7955102040816328e-05, "loss": 3.822951316833496, "step": 601 }, { "epoch": 0.1204, "grad_norm": 5.871342658996582, "learning_rate": 1.7951020408163266e-05, "loss": 3.6863601207733154, "step": 602 }, { "epoch": 0.1206, "grad_norm": 10.98366641998291, "learning_rate": 1.7946938775510207e-05, "loss": 0.30503085255622864, "step": 603 }, { "epoch": 0.1208, "grad_norm": 10.534703254699707, "learning_rate": 1.7942857142857144e-05, "loss": 0.4975493252277374, "step": 604 }, { "epoch": 0.121, "grad_norm": 14.222439765930176, "learning_rate": 1.7938775510204082e-05, "loss": 0.3344534933567047, "step": 605 }, { "epoch": 0.1212, "grad_norm": 12.61806869506836, "learning_rate": 1.7934693877551023e-05, "loss": 0.3309352695941925, "step": 606 }, { "epoch": 0.1214, "grad_norm": 13.753547668457031, "learning_rate": 1.793061224489796e-05, "loss": 0.3978783190250397, "step": 607 }, { "epoch": 0.1216, "grad_norm": 13.207645416259766, "learning_rate": 1.7926530612244898e-05, "loss": 0.36826494336128235, "step": 608 }, { "epoch": 0.1218, "grad_norm": 6.715635776519775, "learning_rate": 1.792244897959184e-05, "loss": 0.19177716970443726, "step": 609 }, { "epoch": 0.122, "grad_norm": 6.490627288818359, "learning_rate": 1.7918367346938777e-05, "loss": 0.12844403088092804, "step": 610 }, { "epoch": 0.1222, "grad_norm": 12.998241424560547, "learning_rate": 1.7914285714285715e-05, "loss": 0.4355250895023346, "step": 611 }, { "epoch": 0.1224, "grad_norm": 4.835819721221924, "learning_rate": 1.7910204081632652e-05, "loss": 0.06730211526155472, "step": 612 }, { "epoch": 0.1226, "grad_norm": 12.499910354614258, "learning_rate": 1.7906122448979593e-05, "loss": 0.9613510966300964, "step": 613 }, { "epoch": 0.1228, "grad_norm": 16.230520248413086, "learning_rate": 1.790204081632653e-05, "loss": 1.2226914167404175, "step": 614 }, { "epoch": 0.123, "grad_norm": 12.136612892150879, "learning_rate": 1.789795918367347e-05, "loss": 0.4556024372577667, "step": 615 }, { "epoch": 0.1232, "grad_norm": 11.854321479797363, "learning_rate": 1.789387755102041e-05, "loss": 0.7772054076194763, "step": 616 }, { "epoch": 0.1234, "grad_norm": 7.2010931968688965, "learning_rate": 1.7889795918367347e-05, "loss": 0.17583107948303223, "step": 617 }, { "epoch": 0.1236, "grad_norm": 8.672499656677246, "learning_rate": 1.7885714285714285e-05, "loss": 0.18448638916015625, "step": 618 }, { "epoch": 0.1238, "grad_norm": 9.116911888122559, "learning_rate": 1.7881632653061226e-05, "loss": 0.7564261555671692, "step": 619 }, { "epoch": 0.124, "grad_norm": 10.779072761535645, "learning_rate": 1.7877551020408164e-05, "loss": 0.8381101489067078, "step": 620 }, { "epoch": 0.1242, "grad_norm": 12.201075553894043, "learning_rate": 1.78734693877551e-05, "loss": 0.5493308901786804, "step": 621 }, { "epoch": 0.1244, "grad_norm": 16.000568389892578, "learning_rate": 1.7869387755102042e-05, "loss": 1.2077207565307617, "step": 622 }, { "epoch": 0.1246, "grad_norm": 7.620370388031006, "learning_rate": 1.7865306122448983e-05, "loss": 0.2562389373779297, "step": 623 }, { "epoch": 0.1248, "grad_norm": 7.897797107696533, "learning_rate": 1.786122448979592e-05, "loss": 0.43710216879844666, "step": 624 }, { "epoch": 0.125, "grad_norm": 10.918147087097168, "learning_rate": 1.785714285714286e-05, "loss": 0.3691123425960541, "step": 625 }, { "epoch": 0.1252, "grad_norm": 11.561279296875, "learning_rate": 1.78530612244898e-05, "loss": 0.7716384530067444, "step": 626 }, { "epoch": 0.1254, "grad_norm": 9.598849296569824, "learning_rate": 1.7848979591836737e-05, "loss": 0.599067747592926, "step": 627 }, { "epoch": 0.1256, "grad_norm": 10.998629570007324, "learning_rate": 1.7844897959183675e-05, "loss": 0.9171390533447266, "step": 628 }, { "epoch": 0.1258, "grad_norm": 11.740135192871094, "learning_rate": 1.7840816326530616e-05, "loss": 0.5126091241836548, "step": 629 }, { "epoch": 0.126, "grad_norm": 9.241000175476074, "learning_rate": 1.7836734693877553e-05, "loss": 0.5009287595748901, "step": 630 }, { "epoch": 0.1262, "grad_norm": 15.022573471069336, "learning_rate": 1.783265306122449e-05, "loss": 1.0993671417236328, "step": 631 }, { "epoch": 0.1264, "grad_norm": 14.095918655395508, "learning_rate": 1.7828571428571432e-05, "loss": 0.8669114708900452, "step": 632 }, { "epoch": 0.1266, "grad_norm": 32.64982604980469, "learning_rate": 1.782448979591837e-05, "loss": 1.9596363306045532, "step": 633 }, { "epoch": 0.1268, "grad_norm": 38.364131927490234, "learning_rate": 1.7820408163265307e-05, "loss": 2.6949901580810547, "step": 634 }, { "epoch": 0.127, "grad_norm": 7.988552570343018, "learning_rate": 1.781632653061225e-05, "loss": 0.4111875593662262, "step": 635 }, { "epoch": 0.1272, "grad_norm": 14.480839729309082, "learning_rate": 1.7812244897959186e-05, "loss": 0.436307817697525, "step": 636 }, { "epoch": 0.1274, "grad_norm": 20.296417236328125, "learning_rate": 1.7808163265306124e-05, "loss": 1.3135007619857788, "step": 637 }, { "epoch": 0.1276, "grad_norm": 14.601682662963867, "learning_rate": 1.780408163265306e-05, "loss": 1.0407058000564575, "step": 638 }, { "epoch": 0.1278, "grad_norm": 11.520345687866211, "learning_rate": 1.7800000000000002e-05, "loss": 0.6323835253715515, "step": 639 }, { "epoch": 0.128, "grad_norm": 8.412186622619629, "learning_rate": 1.779591836734694e-05, "loss": 0.3591906726360321, "step": 640 }, { "epoch": 0.1282, "grad_norm": 7.596136569976807, "learning_rate": 1.7791836734693878e-05, "loss": 0.2633102834224701, "step": 641 }, { "epoch": 0.1284, "grad_norm": 7.652013301849365, "learning_rate": 1.778775510204082e-05, "loss": 0.2844131290912628, "step": 642 }, { "epoch": 0.1286, "grad_norm": 14.937522888183594, "learning_rate": 1.7783673469387756e-05, "loss": 0.931900680065155, "step": 643 }, { "epoch": 0.1288, "grad_norm": 11.996570587158203, "learning_rate": 1.7779591836734694e-05, "loss": 0.8581549525260925, "step": 644 }, { "epoch": 0.129, "grad_norm": 11.707404136657715, "learning_rate": 1.7775510204081635e-05, "loss": 1.1206721067428589, "step": 645 }, { "epoch": 0.1292, "grad_norm": 9.904662132263184, "learning_rate": 1.7771428571428573e-05, "loss": 1.031593680381775, "step": 646 }, { "epoch": 0.1294, "grad_norm": 12.293519020080566, "learning_rate": 1.776734693877551e-05, "loss": 1.2122260332107544, "step": 647 }, { "epoch": 0.1296, "grad_norm": 10.546151161193848, "learning_rate": 1.776326530612245e-05, "loss": 1.0906754732131958, "step": 648 }, { "epoch": 0.1298, "grad_norm": 8.968172073364258, "learning_rate": 1.775918367346939e-05, "loss": 0.2922592759132385, "step": 649 }, { "epoch": 0.13, "grad_norm": 9.768681526184082, "learning_rate": 1.7755102040816327e-05, "loss": 0.4742162227630615, "step": 650 }, { "epoch": 0.1302, "grad_norm": 12.723608016967773, "learning_rate": 1.7751020408163268e-05, "loss": 0.7791746258735657, "step": 651 }, { "epoch": 0.1304, "grad_norm": 10.014815330505371, "learning_rate": 1.7746938775510205e-05, "loss": 0.8886891007423401, "step": 652 }, { "epoch": 0.1306, "grad_norm": 13.535554885864258, "learning_rate": 1.7742857142857143e-05, "loss": 0.861926257610321, "step": 653 }, { "epoch": 0.1308, "grad_norm": 14.146342277526855, "learning_rate": 1.7738775510204084e-05, "loss": 1.4108766317367554, "step": 654 }, { "epoch": 0.131, "grad_norm": 11.172720909118652, "learning_rate": 1.773469387755102e-05, "loss": 1.0398005247116089, "step": 655 }, { "epoch": 0.1312, "grad_norm": 12.420480728149414, "learning_rate": 1.773061224489796e-05, "loss": 1.098586082458496, "step": 656 }, { "epoch": 0.1314, "grad_norm": 13.521431922912598, "learning_rate": 1.77265306122449e-05, "loss": 0.5681458115577698, "step": 657 }, { "epoch": 0.1316, "grad_norm": 15.764737129211426, "learning_rate": 1.7722448979591838e-05, "loss": 0.48384639620780945, "step": 658 }, { "epoch": 0.1318, "grad_norm": 12.213311195373535, "learning_rate": 1.7718367346938775e-05, "loss": 0.695458710193634, "step": 659 }, { "epoch": 0.132, "grad_norm": 8.498339653015137, "learning_rate": 1.7714285714285717e-05, "loss": 0.09367576986551285, "step": 660 }, { "epoch": 0.1322, "grad_norm": 13.739344596862793, "learning_rate": 1.7710204081632654e-05, "loss": 0.9440757632255554, "step": 661 }, { "epoch": 0.1324, "grad_norm": 13.500205993652344, "learning_rate": 1.7706122448979592e-05, "loss": 0.5242319107055664, "step": 662 }, { "epoch": 0.1326, "grad_norm": 11.270493507385254, "learning_rate": 1.7702040816326533e-05, "loss": 0.6261625289916992, "step": 663 }, { "epoch": 0.1328, "grad_norm": 11.79617691040039, "learning_rate": 1.769795918367347e-05, "loss": 0.7476010918617249, "step": 664 }, { "epoch": 0.133, "grad_norm": 11.163064956665039, "learning_rate": 1.7693877551020408e-05, "loss": 0.42261502146720886, "step": 665 }, { "epoch": 0.1332, "grad_norm": 8.681324005126953, "learning_rate": 1.7689795918367346e-05, "loss": 0.2355920523405075, "step": 666 }, { "epoch": 0.1334, "grad_norm": 11.953144073486328, "learning_rate": 1.7685714285714287e-05, "loss": 0.447518914937973, "step": 667 }, { "epoch": 0.1336, "grad_norm": 7.357821941375732, "learning_rate": 1.7681632653061228e-05, "loss": 0.4788515865802765, "step": 668 }, { "epoch": 0.1338, "grad_norm": 7.640774726867676, "learning_rate": 1.7677551020408165e-05, "loss": 0.23292386531829834, "step": 669 }, { "epoch": 0.134, "grad_norm": 5.623747825622559, "learning_rate": 1.7673469387755103e-05, "loss": 0.06400511413812637, "step": 670 }, { "epoch": 0.1342, "grad_norm": 14.167566299438477, "learning_rate": 1.7669387755102044e-05, "loss": 1.2401551008224487, "step": 671 }, { "epoch": 0.1344, "grad_norm": 11.210840225219727, "learning_rate": 1.7665306122448982e-05, "loss": 0.9809815287590027, "step": 672 }, { "epoch": 0.1346, "grad_norm": 9.620011329650879, "learning_rate": 1.766122448979592e-05, "loss": 0.7593598961830139, "step": 673 }, { "epoch": 0.1348, "grad_norm": 9.036689758300781, "learning_rate": 1.765714285714286e-05, "loss": 0.7323796153068542, "step": 674 }, { "epoch": 0.135, "grad_norm": 8.841421127319336, "learning_rate": 1.7653061224489798e-05, "loss": 0.36300086975097656, "step": 675 }, { "epoch": 0.1352, "grad_norm": 5.024514198303223, "learning_rate": 1.7648979591836736e-05, "loss": 0.1856626719236374, "step": 676 }, { "epoch": 0.1354, "grad_norm": 10.672222137451172, "learning_rate": 1.7644897959183677e-05, "loss": 0.4185608923435211, "step": 677 }, { "epoch": 0.1356, "grad_norm": 5.480457782745361, "learning_rate": 1.7640816326530614e-05, "loss": 0.4270029067993164, "step": 678 }, { "epoch": 0.1358, "grad_norm": 13.384345054626465, "learning_rate": 1.7636734693877552e-05, "loss": 0.9341201782226562, "step": 679 }, { "epoch": 0.136, "grad_norm": 11.61329460144043, "learning_rate": 1.7632653061224493e-05, "loss": 0.8007194399833679, "step": 680 }, { "epoch": 0.1362, "grad_norm": 21.145790100097656, "learning_rate": 1.762857142857143e-05, "loss": 0.6441300511360168, "step": 681 }, { "epoch": 0.1364, "grad_norm": 18.466842651367188, "learning_rate": 1.7624489795918368e-05, "loss": 0.604012668132782, "step": 682 }, { "epoch": 0.1366, "grad_norm": 9.196843147277832, "learning_rate": 1.762040816326531e-05, "loss": 2.540626287460327, "step": 683 }, { "epoch": 0.1368, "grad_norm": 8.944679260253906, "learning_rate": 1.7616326530612247e-05, "loss": 2.4371800422668457, "step": 684 }, { "epoch": 0.137, "grad_norm": 33.24692153930664, "learning_rate": 1.7612244897959185e-05, "loss": 2.222153425216675, "step": 685 }, { "epoch": 0.1372, "grad_norm": 27.14830207824707, "learning_rate": 1.7608163265306126e-05, "loss": 2.2262842655181885, "step": 686 }, { "epoch": 0.1374, "grad_norm": 10.974462509155273, "learning_rate": 1.7604081632653063e-05, "loss": 0.42491415143013, "step": 687 }, { "epoch": 0.1376, "grad_norm": 10.665210723876953, "learning_rate": 1.76e-05, "loss": 0.31951087713241577, "step": 688 }, { "epoch": 0.1378, "grad_norm": 10.70345401763916, "learning_rate": 1.7595918367346942e-05, "loss": 0.4475421905517578, "step": 689 }, { "epoch": 0.138, "grad_norm": 8.484381675720215, "learning_rate": 1.759183673469388e-05, "loss": 0.23042087256908417, "step": 690 }, { "epoch": 0.1382, "grad_norm": 15.865165710449219, "learning_rate": 1.7587755102040817e-05, "loss": 0.6705883145332336, "step": 691 }, { "epoch": 0.1384, "grad_norm": 12.621859550476074, "learning_rate": 1.7583673469387755e-05, "loss": 0.438053697347641, "step": 692 }, { "epoch": 0.1386, "grad_norm": 9.627793312072754, "learning_rate": 1.7579591836734696e-05, "loss": 0.3794168531894684, "step": 693 }, { "epoch": 0.1388, "grad_norm": 4.623700141906738, "learning_rate": 1.7575510204081634e-05, "loss": 0.04528559371829033, "step": 694 }, { "epoch": 0.139, "grad_norm": 9.119061470031738, "learning_rate": 1.757142857142857e-05, "loss": 0.3755207061767578, "step": 695 }, { "epoch": 0.1392, "grad_norm": 9.938539505004883, "learning_rate": 1.7567346938775512e-05, "loss": 0.3410348892211914, "step": 696 }, { "epoch": 0.1394, "grad_norm": 12.629765510559082, "learning_rate": 1.756326530612245e-05, "loss": 0.7066850662231445, "step": 697 }, { "epoch": 0.1396, "grad_norm": 12.002217292785645, "learning_rate": 1.7559183673469387e-05, "loss": 0.7674385905265808, "step": 698 }, { "epoch": 0.1398, "grad_norm": 13.5888671875, "learning_rate": 1.755510204081633e-05, "loss": 1.082494854927063, "step": 699 }, { "epoch": 0.14, "grad_norm": 14.132606506347656, "learning_rate": 1.7551020408163266e-05, "loss": 0.565140426158905, "step": 700 }, { "epoch": 0.1402, "grad_norm": 10.079045295715332, "learning_rate": 1.7546938775510204e-05, "loss": 0.2579793930053711, "step": 701 }, { "epoch": 0.1404, "grad_norm": 8.026155471801758, "learning_rate": 1.7542857142857145e-05, "loss": 0.1645224541425705, "step": 702 }, { "epoch": 0.1406, "grad_norm": 9.82970142364502, "learning_rate": 1.7538775510204082e-05, "loss": 0.1751258820295334, "step": 703 }, { "epoch": 0.1408, "grad_norm": 8.215539932250977, "learning_rate": 1.753469387755102e-05, "loss": 0.1312454342842102, "step": 704 }, { "epoch": 0.141, "grad_norm": 8.542089462280273, "learning_rate": 1.753061224489796e-05, "loss": 0.38043227791786194, "step": 705 }, { "epoch": 0.1412, "grad_norm": 5.60847282409668, "learning_rate": 1.75265306122449e-05, "loss": 0.10540112853050232, "step": 706 }, { "epoch": 0.1414, "grad_norm": 10.964652061462402, "learning_rate": 1.7522448979591836e-05, "loss": 1.119620680809021, "step": 707 }, { "epoch": 0.1416, "grad_norm": 12.320873260498047, "learning_rate": 1.7518367346938777e-05, "loss": 0.46061721444129944, "step": 708 }, { "epoch": 0.1418, "grad_norm": 8.906105995178223, "learning_rate": 1.7514285714285715e-05, "loss": 1.1816500425338745, "step": 709 }, { "epoch": 0.142, "grad_norm": 11.270313262939453, "learning_rate": 1.7510204081632653e-05, "loss": 0.8878250122070312, "step": 710 }, { "epoch": 0.1422, "grad_norm": 14.09372615814209, "learning_rate": 1.7506122448979594e-05, "loss": 1.0754565000534058, "step": 711 }, { "epoch": 0.1424, "grad_norm": 14.562458992004395, "learning_rate": 1.750204081632653e-05, "loss": 0.9496583938598633, "step": 712 }, { "epoch": 0.1426, "grad_norm": 16.510957717895508, "learning_rate": 1.7497959183673472e-05, "loss": 0.6176168918609619, "step": 713 }, { "epoch": 0.1428, "grad_norm": 15.548184394836426, "learning_rate": 1.749387755102041e-05, "loss": 0.5802152156829834, "step": 714 }, { "epoch": 0.143, "grad_norm": 16.158681869506836, "learning_rate": 1.748979591836735e-05, "loss": 0.5384200215339661, "step": 715 }, { "epoch": 0.1432, "grad_norm": 13.481738090515137, "learning_rate": 1.748571428571429e-05, "loss": 1.1257078647613525, "step": 716 }, { "epoch": 0.1434, "grad_norm": 11.724091529846191, "learning_rate": 1.7481632653061226e-05, "loss": 0.45442309975624084, "step": 717 }, { "epoch": 0.1436, "grad_norm": 11.29008674621582, "learning_rate": 1.7477551020408164e-05, "loss": 0.45394429564476013, "step": 718 }, { "epoch": 0.1438, "grad_norm": 6.764264106750488, "learning_rate": 1.7473469387755105e-05, "loss": 0.2008994072675705, "step": 719 }, { "epoch": 0.144, "grad_norm": 5.740670204162598, "learning_rate": 1.7469387755102043e-05, "loss": 0.08834325522184372, "step": 720 }, { "epoch": 0.1442, "grad_norm": 9.330862045288086, "learning_rate": 1.746530612244898e-05, "loss": 0.24800275266170502, "step": 721 }, { "epoch": 0.1444, "grad_norm": 7.405993461608887, "learning_rate": 1.746122448979592e-05, "loss": 0.516298770904541, "step": 722 }, { "epoch": 0.1446, "grad_norm": 14.721088409423828, "learning_rate": 1.745714285714286e-05, "loss": 0.5837984085083008, "step": 723 }, { "epoch": 0.1448, "grad_norm": 10.000396728515625, "learning_rate": 1.7453061224489797e-05, "loss": 0.3792562782764435, "step": 724 }, { "epoch": 0.145, "grad_norm": 10.423584938049316, "learning_rate": 1.7448979591836738e-05, "loss": 0.32581838965415955, "step": 725 }, { "epoch": 0.1452, "grad_norm": 9.63492488861084, "learning_rate": 1.7444897959183675e-05, "loss": 0.3101356029510498, "step": 726 }, { "epoch": 0.1454, "grad_norm": 13.9413423538208, "learning_rate": 1.7440816326530613e-05, "loss": 0.6439652442932129, "step": 727 }, { "epoch": 0.1456, "grad_norm": 9.161519050598145, "learning_rate": 1.7436734693877554e-05, "loss": 0.5394307971000671, "step": 728 }, { "epoch": 0.1458, "grad_norm": 13.081731796264648, "learning_rate": 1.743265306122449e-05, "loss": 0.4912102222442627, "step": 729 }, { "epoch": 0.146, "grad_norm": 11.13801097869873, "learning_rate": 1.742857142857143e-05, "loss": 0.3857848644256592, "step": 730 }, { "epoch": 0.1462, "grad_norm": 13.22642993927002, "learning_rate": 1.742448979591837e-05, "loss": 0.4448005259037018, "step": 731 }, { "epoch": 0.1464, "grad_norm": 11.004619598388672, "learning_rate": 1.7420408163265308e-05, "loss": 0.24569053947925568, "step": 732 }, { "epoch": 0.1466, "grad_norm": 15.297962188720703, "learning_rate": 1.7416326530612245e-05, "loss": 0.5006201863288879, "step": 733 }, { "epoch": 0.1468, "grad_norm": 10.685734748840332, "learning_rate": 1.7412244897959187e-05, "loss": 0.38487866520881653, "step": 734 }, { "epoch": 0.147, "grad_norm": 9.32356071472168, "learning_rate": 1.7408163265306124e-05, "loss": 0.2797485291957855, "step": 735 }, { "epoch": 0.1472, "grad_norm": 11.006173133850098, "learning_rate": 1.7404081632653062e-05, "loss": 0.42651239037513733, "step": 736 }, { "epoch": 0.1474, "grad_norm": 16.145952224731445, "learning_rate": 1.7400000000000003e-05, "loss": 0.47780147194862366, "step": 737 }, { "epoch": 0.1476, "grad_norm": 16.576457977294922, "learning_rate": 1.739591836734694e-05, "loss": 0.5122077465057373, "step": 738 }, { "epoch": 0.1478, "grad_norm": 6.308256149291992, "learning_rate": 1.7391836734693878e-05, "loss": 3.6567018032073975, "step": 739 }, { "epoch": 0.148, "grad_norm": 6.102212429046631, "learning_rate": 1.738775510204082e-05, "loss": 3.6497976779937744, "step": 740 }, { "epoch": 0.1482, "grad_norm": 10.994969367980957, "learning_rate": 1.7383673469387757e-05, "loss": 0.416873574256897, "step": 741 }, { "epoch": 0.1484, "grad_norm": 10.662221908569336, "learning_rate": 1.7379591836734694e-05, "loss": 0.41474536061286926, "step": 742 }, { "epoch": 0.1486, "grad_norm": 16.18564224243164, "learning_rate": 1.7375510204081632e-05, "loss": 1.3894104957580566, "step": 743 }, { "epoch": 0.1488, "grad_norm": 11.509024620056152, "learning_rate": 1.7371428571428573e-05, "loss": 0.9457443356513977, "step": 744 }, { "epoch": 0.149, "grad_norm": 11.563676834106445, "learning_rate": 1.736734693877551e-05, "loss": 1.4715752601623535, "step": 745 }, { "epoch": 0.1492, "grad_norm": 12.36917781829834, "learning_rate": 1.736326530612245e-05, "loss": 1.0810216665267944, "step": 746 }, { "epoch": 0.1494, "grad_norm": 23.451833724975586, "learning_rate": 1.735918367346939e-05, "loss": 1.1819260120391846, "step": 747 }, { "epoch": 0.1496, "grad_norm": 21.954042434692383, "learning_rate": 1.7355102040816327e-05, "loss": 0.9266975522041321, "step": 748 }, { "epoch": 0.1498, "grad_norm": 10.645434379577637, "learning_rate": 1.7351020408163265e-05, "loss": 0.4366590082645416, "step": 749 }, { "epoch": 0.15, "grad_norm": 10.085464477539062, "learning_rate": 1.7346938775510206e-05, "loss": 0.39136600494384766, "step": 750 }, { "epoch": 0.1502, "grad_norm": 13.359373092651367, "learning_rate": 1.7342857142857143e-05, "loss": 0.6837975382804871, "step": 751 }, { "epoch": 0.1504, "grad_norm": 8.370230674743652, "learning_rate": 1.733877551020408e-05, "loss": 1.0935190916061401, "step": 752 }, { "epoch": 0.1506, "grad_norm": 14.436971664428711, "learning_rate": 1.7334693877551022e-05, "loss": 0.5633415579795837, "step": 753 }, { "epoch": 0.1508, "grad_norm": 10.484057426452637, "learning_rate": 1.733061224489796e-05, "loss": 1.3053311109542847, "step": 754 }, { "epoch": 0.151, "grad_norm": 13.887869834899902, "learning_rate": 1.7326530612244897e-05, "loss": 0.6421354413032532, "step": 755 }, { "epoch": 0.1512, "grad_norm": 7.413531303405762, "learning_rate": 1.7322448979591838e-05, "loss": 0.4527202546596527, "step": 756 }, { "epoch": 0.1514, "grad_norm": 40.82472229003906, "learning_rate": 1.7318367346938776e-05, "loss": 2.967447280883789, "step": 757 }, { "epoch": 0.1516, "grad_norm": 19.505496978759766, "learning_rate": 1.7314285714285717e-05, "loss": 2.8037269115448, "step": 758 }, { "epoch": 0.1518, "grad_norm": 11.161591529846191, "learning_rate": 1.7310204081632655e-05, "loss": 1.213711142539978, "step": 759 }, { "epoch": 0.152, "grad_norm": 10.131704330444336, "learning_rate": 1.7306122448979596e-05, "loss": 1.0836795568466187, "step": 760 }, { "epoch": 0.1522, "grad_norm": 21.120372772216797, "learning_rate": 1.7302040816326533e-05, "loss": 0.5193700194358826, "step": 761 }, { "epoch": 0.1524, "grad_norm": 7.548417091369629, "learning_rate": 1.729795918367347e-05, "loss": 0.33356955647468567, "step": 762 }, { "epoch": 0.1526, "grad_norm": 10.09682846069336, "learning_rate": 1.7293877551020412e-05, "loss": 0.945113480091095, "step": 763 }, { "epoch": 0.1528, "grad_norm": 8.723321914672852, "learning_rate": 1.728979591836735e-05, "loss": 1.0624339580535889, "step": 764 }, { "epoch": 0.153, "grad_norm": 11.091486930847168, "learning_rate": 1.7285714285714287e-05, "loss": 0.49018755555152893, "step": 765 }, { "epoch": 0.1532, "grad_norm": 7.9845170974731445, "learning_rate": 1.7281632653061228e-05, "loss": 0.07966050505638123, "step": 766 }, { "epoch": 0.1534, "grad_norm": 9.461292266845703, "learning_rate": 1.7277551020408166e-05, "loss": 1.2714213132858276, "step": 767 }, { "epoch": 0.1536, "grad_norm": 7.521483898162842, "learning_rate": 1.7273469387755104e-05, "loss": 0.8111341595649719, "step": 768 }, { "epoch": 0.1538, "grad_norm": 7.449307918548584, "learning_rate": 1.726938775510204e-05, "loss": 0.582644522190094, "step": 769 }, { "epoch": 0.154, "grad_norm": 7.701537609100342, "learning_rate": 1.7265306122448982e-05, "loss": 0.2264944314956665, "step": 770 }, { "epoch": 0.1542, "grad_norm": 9.337274551391602, "learning_rate": 1.726122448979592e-05, "loss": 0.8635380268096924, "step": 771 }, { "epoch": 0.1544, "grad_norm": 8.135647773742676, "learning_rate": 1.7257142857142857e-05, "loss": 0.22340886294841766, "step": 772 }, { "epoch": 0.1546, "grad_norm": 10.464387893676758, "learning_rate": 1.72530612244898e-05, "loss": 0.7764620780944824, "step": 773 }, { "epoch": 0.1548, "grad_norm": 11.095023155212402, "learning_rate": 1.7248979591836736e-05, "loss": 0.4418260157108307, "step": 774 }, { "epoch": 0.155, "grad_norm": 11.342694282531738, "learning_rate": 1.7244897959183674e-05, "loss": 1.4569212198257446, "step": 775 }, { "epoch": 0.1552, "grad_norm": 14.055512428283691, "learning_rate": 1.7240816326530615e-05, "loss": 1.60598886013031, "step": 776 }, { "epoch": 0.1554, "grad_norm": 17.556177139282227, "learning_rate": 1.7236734693877552e-05, "loss": 0.31339550018310547, "step": 777 }, { "epoch": 0.1556, "grad_norm": 10.759800910949707, "learning_rate": 1.723265306122449e-05, "loss": 0.34965983033180237, "step": 778 }, { "epoch": 0.1558, "grad_norm": 4.928140163421631, "learning_rate": 1.722857142857143e-05, "loss": 0.0968223437666893, "step": 779 }, { "epoch": 0.156, "grad_norm": 4.8700361251831055, "learning_rate": 1.722448979591837e-05, "loss": 0.07303012162446976, "step": 780 }, { "epoch": 0.1562, "grad_norm": 13.173325538635254, "learning_rate": 1.7220408163265306e-05, "loss": 0.7579927444458008, "step": 781 }, { "epoch": 0.1564, "grad_norm": 11.799232482910156, "learning_rate": 1.7216326530612247e-05, "loss": 1.2781928777694702, "step": 782 }, { "epoch": 0.1566, "grad_norm": 11.70905876159668, "learning_rate": 1.7212244897959185e-05, "loss": 0.45815756916999817, "step": 783 }, { "epoch": 0.1568, "grad_norm": 18.038320541381836, "learning_rate": 1.7208163265306123e-05, "loss": 0.3250984251499176, "step": 784 }, { "epoch": 0.157, "grad_norm": 17.403844833374023, "learning_rate": 1.7204081632653064e-05, "loss": 1.2099705934524536, "step": 785 }, { "epoch": 0.1572, "grad_norm": 15.40157413482666, "learning_rate": 1.72e-05, "loss": 1.0935746431350708, "step": 786 }, { "epoch": 0.1574, "grad_norm": 8.310811042785645, "learning_rate": 1.719591836734694e-05, "loss": 0.8277642130851746, "step": 787 }, { "epoch": 0.1576, "grad_norm": 8.334830284118652, "learning_rate": 1.719183673469388e-05, "loss": 0.8446757197380066, "step": 788 }, { "epoch": 0.1578, "grad_norm": 8.584236145019531, "learning_rate": 1.7187755102040818e-05, "loss": 0.38698020577430725, "step": 789 }, { "epoch": 0.158, "grad_norm": 9.955022811889648, "learning_rate": 1.7183673469387755e-05, "loss": 0.5164220929145813, "step": 790 }, { "epoch": 0.1582, "grad_norm": 9.958945274353027, "learning_rate": 1.7179591836734696e-05, "loss": 1.1173901557922363, "step": 791 }, { "epoch": 0.1584, "grad_norm": 8.765222549438477, "learning_rate": 1.7175510204081634e-05, "loss": 0.9976674914360046, "step": 792 }, { "epoch": 0.1586, "grad_norm": 23.056232452392578, "learning_rate": 1.717142857142857e-05, "loss": 1.370761752128601, "step": 793 }, { "epoch": 0.1588, "grad_norm": 23.940549850463867, "learning_rate": 1.7167346938775513e-05, "loss": 1.3595713376998901, "step": 794 }, { "epoch": 0.159, "grad_norm": 18.20630645751953, "learning_rate": 1.716326530612245e-05, "loss": 1.7061136960983276, "step": 795 }, { "epoch": 0.1592, "grad_norm": 16.01688003540039, "learning_rate": 1.7159183673469388e-05, "loss": 1.3681236505508423, "step": 796 }, { "epoch": 0.1594, "grad_norm": 8.32278823852539, "learning_rate": 1.7155102040816326e-05, "loss": 0.9235571026802063, "step": 797 }, { "epoch": 0.1596, "grad_norm": 10.294227600097656, "learning_rate": 1.7151020408163267e-05, "loss": 0.5937687754631042, "step": 798 }, { "epoch": 0.1598, "grad_norm": 11.361611366271973, "learning_rate": 1.7146938775510204e-05, "loss": 0.8640332221984863, "step": 799 }, { "epoch": 0.16, "grad_norm": 7.6554789543151855, "learning_rate": 1.7142857142857142e-05, "loss": 0.8463423848152161, "step": 800 }, { "epoch": 0.1602, "grad_norm": 8.576823234558105, "learning_rate": 1.7138775510204083e-05, "loss": 0.3656291961669922, "step": 801 }, { "epoch": 0.1604, "grad_norm": 6.198919773101807, "learning_rate": 1.713469387755102e-05, "loss": 0.22547852993011475, "step": 802 }, { "epoch": 0.1606, "grad_norm": 8.521926879882812, "learning_rate": 1.7130612244897958e-05, "loss": 0.3603907525539398, "step": 803 }, { "epoch": 0.1608, "grad_norm": 4.4991044998168945, "learning_rate": 1.71265306122449e-05, "loss": 0.04183853790163994, "step": 804 }, { "epoch": 0.161, "grad_norm": 7.233295440673828, "learning_rate": 1.712244897959184e-05, "loss": 0.22137486934661865, "step": 805 }, { "epoch": 0.1612, "grad_norm": 3.7667109966278076, "learning_rate": 1.7118367346938778e-05, "loss": 0.04676097258925438, "step": 806 }, { "epoch": 0.1614, "grad_norm": 8.665865898132324, "learning_rate": 1.7114285714285715e-05, "loss": 0.3963099420070648, "step": 807 }, { "epoch": 0.1616, "grad_norm": 5.754110336303711, "learning_rate": 1.7110204081632657e-05, "loss": 0.07381335645914078, "step": 808 }, { "epoch": 0.1618, "grad_norm": 15.004722595214844, "learning_rate": 1.7106122448979594e-05, "loss": 0.8166268467903137, "step": 809 }, { "epoch": 0.162, "grad_norm": 14.96613883972168, "learning_rate": 1.7102040816326532e-05, "loss": 0.2907205820083618, "step": 810 }, { "epoch": 0.1622, "grad_norm": 16.329971313476562, "learning_rate": 1.7097959183673473e-05, "loss": 0.6810183525085449, "step": 811 }, { "epoch": 0.1624, "grad_norm": 11.588258743286133, "learning_rate": 1.709387755102041e-05, "loss": 0.3183893859386444, "step": 812 }, { "epoch": 0.1626, "grad_norm": 11.18985652923584, "learning_rate": 1.7089795918367348e-05, "loss": 0.33114203810691833, "step": 813 }, { "epoch": 0.1628, "grad_norm": 11.678988456726074, "learning_rate": 1.708571428571429e-05, "loss": 0.7535752654075623, "step": 814 }, { "epoch": 0.163, "grad_norm": 13.68851375579834, "learning_rate": 1.7081632653061227e-05, "loss": 0.889157772064209, "step": 815 }, { "epoch": 0.1632, "grad_norm": 14.774972915649414, "learning_rate": 1.7077551020408164e-05, "loss": 1.3850507736206055, "step": 816 }, { "epoch": 0.1634, "grad_norm": 11.22496223449707, "learning_rate": 1.7073469387755105e-05, "loss": 0.2509547472000122, "step": 817 }, { "epoch": 0.1636, "grad_norm": 10.1691312789917, "learning_rate": 1.7069387755102043e-05, "loss": 0.150825634598732, "step": 818 }, { "epoch": 0.1638, "grad_norm": 63.16236114501953, "learning_rate": 1.706530612244898e-05, "loss": 2.487028121948242, "step": 819 }, { "epoch": 0.164, "grad_norm": 29.83321189880371, "learning_rate": 1.7061224489795922e-05, "loss": 1.8375195264816284, "step": 820 }, { "epoch": 0.1642, "grad_norm": 13.816812515258789, "learning_rate": 1.705714285714286e-05, "loss": 0.6691530346870422, "step": 821 }, { "epoch": 0.1644, "grad_norm": 4.308398723602295, "learning_rate": 1.7053061224489797e-05, "loss": 0.05395898222923279, "step": 822 }, { "epoch": 0.1646, "grad_norm": 12.161376953125, "learning_rate": 1.7048979591836735e-05, "loss": 0.49681249260902405, "step": 823 }, { "epoch": 0.1648, "grad_norm": 7.281305313110352, "learning_rate": 1.7044897959183676e-05, "loss": 0.2690201997756958, "step": 824 }, { "epoch": 0.165, "grad_norm": 15.228219032287598, "learning_rate": 1.7040816326530613e-05, "loss": 2.183903932571411, "step": 825 }, { "epoch": 0.1652, "grad_norm": 11.266607284545898, "learning_rate": 1.703673469387755e-05, "loss": 1.8891977071762085, "step": 826 }, { "epoch": 0.1654, "grad_norm": 9.80399227142334, "learning_rate": 1.7032653061224492e-05, "loss": 1.9456223249435425, "step": 827 }, { "epoch": 0.1656, "grad_norm": 6.174814224243164, "learning_rate": 1.702857142857143e-05, "loss": 1.6220054626464844, "step": 828 }, { "epoch": 0.1658, "grad_norm": 9.70406723022461, "learning_rate": 1.7024489795918367e-05, "loss": 1.2355018854141235, "step": 829 }, { "epoch": 0.166, "grad_norm": 8.401901245117188, "learning_rate": 1.7020408163265308e-05, "loss": 0.797677755355835, "step": 830 }, { "epoch": 0.1662, "grad_norm": 10.09656810760498, "learning_rate": 1.7016326530612246e-05, "loss": 0.30209818482398987, "step": 831 }, { "epoch": 0.1664, "grad_norm": 7.124859809875488, "learning_rate": 1.7012244897959184e-05, "loss": 0.24133729934692383, "step": 832 }, { "epoch": 0.1666, "grad_norm": 8.425739288330078, "learning_rate": 1.7008163265306125e-05, "loss": 0.19166846573352814, "step": 833 }, { "epoch": 0.1668, "grad_norm": 5.5932769775390625, "learning_rate": 1.7004081632653062e-05, "loss": 0.12489122152328491, "step": 834 }, { "epoch": 0.167, "grad_norm": 7.666117191314697, "learning_rate": 1.7e-05, "loss": 0.3184269368648529, "step": 835 }, { "epoch": 0.1672, "grad_norm": 7.162036895751953, "learning_rate": 1.699591836734694e-05, "loss": 0.14772318303585052, "step": 836 }, { "epoch": 0.1674, "grad_norm": 23.604835510253906, "learning_rate": 1.699183673469388e-05, "loss": 1.2217166423797607, "step": 837 }, { "epoch": 0.1676, "grad_norm": 19.6046142578125, "learning_rate": 1.6987755102040816e-05, "loss": 1.1713870763778687, "step": 838 }, { "epoch": 0.1678, "grad_norm": 10.156699180603027, "learning_rate": 1.6983673469387757e-05, "loss": 0.7312700152397156, "step": 839 }, { "epoch": 0.168, "grad_norm": 6.102944850921631, "learning_rate": 1.6979591836734695e-05, "loss": 0.21069572865962982, "step": 840 }, { "epoch": 0.1682, "grad_norm": 7.401151657104492, "learning_rate": 1.6975510204081632e-05, "loss": 0.18337900936603546, "step": 841 }, { "epoch": 0.1684, "grad_norm": 5.875607490539551, "learning_rate": 1.6971428571428574e-05, "loss": 0.06811075657606125, "step": 842 }, { "epoch": 0.1686, "grad_norm": 15.57354736328125, "learning_rate": 1.696734693877551e-05, "loss": 2.1977641582489014, "step": 843 }, { "epoch": 0.1688, "grad_norm": 12.335282325744629, "learning_rate": 1.696326530612245e-05, "loss": 2.071246862411499, "step": 844 }, { "epoch": 0.169, "grad_norm": 9.727235794067383, "learning_rate": 1.695918367346939e-05, "loss": 1.200844168663025, "step": 845 }, { "epoch": 0.1692, "grad_norm": 11.321294784545898, "learning_rate": 1.6955102040816327e-05, "loss": 1.156197428703308, "step": 846 }, { "epoch": 0.1694, "grad_norm": 27.516820907592773, "learning_rate": 1.6951020408163265e-05, "loss": 1.9044551849365234, "step": 847 }, { "epoch": 0.1696, "grad_norm": 26.124889373779297, "learning_rate": 1.6946938775510203e-05, "loss": 1.5935903787612915, "step": 848 }, { "epoch": 0.1698, "grad_norm": 10.166653633117676, "learning_rate": 1.6942857142857144e-05, "loss": 1.2305039167404175, "step": 849 }, { "epoch": 0.17, "grad_norm": 10.783432006835938, "learning_rate": 1.6938775510204085e-05, "loss": 0.37452611327171326, "step": 850 }, { "epoch": 0.1702, "grad_norm": 8.736797332763672, "learning_rate": 1.6934693877551022e-05, "loss": 0.39462193846702576, "step": 851 }, { "epoch": 0.1704, "grad_norm": 4.686192989349365, "learning_rate": 1.693061224489796e-05, "loss": 0.05741189420223236, "step": 852 }, { "epoch": 0.1706, "grad_norm": 8.015236854553223, "learning_rate": 1.69265306122449e-05, "loss": 0.346451997756958, "step": 853 }, { "epoch": 0.1708, "grad_norm": 4.675577163696289, "learning_rate": 1.692244897959184e-05, "loss": 0.04520675167441368, "step": 854 }, { "epoch": 0.171, "grad_norm": 12.066052436828613, "learning_rate": 1.6918367346938776e-05, "loss": 0.5323671698570251, "step": 855 }, { "epoch": 0.1712, "grad_norm": 4.285087585449219, "learning_rate": 1.6914285714285717e-05, "loss": 0.05052861571311951, "step": 856 }, { "epoch": 0.1714, "grad_norm": 8.832772254943848, "learning_rate": 1.6910204081632655e-05, "loss": 2.343698501586914, "step": 857 }, { "epoch": 0.1716, "grad_norm": 7.151556491851807, "learning_rate": 1.6906122448979593e-05, "loss": 2.344592332839966, "step": 858 }, { "epoch": 0.1718, "grad_norm": 9.483006477355957, "learning_rate": 1.6902040816326534e-05, "loss": 1.2930094003677368, "step": 859 }, { "epoch": 0.172, "grad_norm": 10.535362243652344, "learning_rate": 1.689795918367347e-05, "loss": 0.9568271636962891, "step": 860 }, { "epoch": 0.1722, "grad_norm": 12.547567367553711, "learning_rate": 1.689387755102041e-05, "loss": 0.5192998051643372, "step": 861 }, { "epoch": 0.1724, "grad_norm": 8.66522216796875, "learning_rate": 1.688979591836735e-05, "loss": 0.18473176658153534, "step": 862 }, { "epoch": 0.1726, "grad_norm": 11.593875885009766, "learning_rate": 1.6885714285714288e-05, "loss": 1.230906367301941, "step": 863 }, { "epoch": 0.1728, "grad_norm": 12.751099586486816, "learning_rate": 1.6881632653061225e-05, "loss": 1.3743547201156616, "step": 864 }, { "epoch": 0.173, "grad_norm": 11.993965148925781, "learning_rate": 1.6877551020408166e-05, "loss": 1.8797513246536255, "step": 865 }, { "epoch": 0.1732, "grad_norm": 10.8436279296875, "learning_rate": 1.6873469387755104e-05, "loss": 1.398915410041809, "step": 866 }, { "epoch": 0.1734, "grad_norm": 12.852522850036621, "learning_rate": 1.686938775510204e-05, "loss": 1.0060895681381226, "step": 867 }, { "epoch": 0.1736, "grad_norm": 12.911187171936035, "learning_rate": 1.6865306122448983e-05, "loss": 0.9872347712516785, "step": 868 }, { "epoch": 0.1738, "grad_norm": 10.305516242980957, "learning_rate": 1.686122448979592e-05, "loss": 0.8250768184661865, "step": 869 }, { "epoch": 0.174, "grad_norm": 8.301410675048828, "learning_rate": 1.6857142857142858e-05, "loss": 0.1378517895936966, "step": 870 }, { "epoch": 0.1742, "grad_norm": 10.204079627990723, "learning_rate": 1.68530612244898e-05, "loss": 0.8009190559387207, "step": 871 }, { "epoch": 0.1744, "grad_norm": 11.249664306640625, "learning_rate": 1.6848979591836737e-05, "loss": 0.389691025018692, "step": 872 }, { "epoch": 0.1746, "grad_norm": 10.197457313537598, "learning_rate": 1.6844897959183674e-05, "loss": 0.3324334919452667, "step": 873 }, { "epoch": 0.1748, "grad_norm": 6.4466400146484375, "learning_rate": 1.6840816326530612e-05, "loss": 0.16674156486988068, "step": 874 }, { "epoch": 0.175, "grad_norm": 8.318696022033691, "learning_rate": 1.6836734693877553e-05, "loss": 0.4006370007991791, "step": 875 }, { "epoch": 0.1752, "grad_norm": 4.881451606750488, "learning_rate": 1.683265306122449e-05, "loss": 0.163467139005661, "step": 876 }, { "epoch": 0.1754, "grad_norm": 9.221220970153809, "learning_rate": 1.6828571428571428e-05, "loss": 0.5745043754577637, "step": 877 }, { "epoch": 0.1756, "grad_norm": 9.977005958557129, "learning_rate": 1.682448979591837e-05, "loss": 0.2814178466796875, "step": 878 }, { "epoch": 0.1758, "grad_norm": 10.049405097961426, "learning_rate": 1.6820408163265307e-05, "loss": 0.39915338158607483, "step": 879 }, { "epoch": 0.176, "grad_norm": 8.953500747680664, "learning_rate": 1.6816326530612244e-05, "loss": 0.6644017696380615, "step": 880 }, { "epoch": 0.1762, "grad_norm": 35.30412292480469, "learning_rate": 1.6812244897959185e-05, "loss": 2.2476003170013428, "step": 881 }, { "epoch": 0.1764, "grad_norm": 23.374265670776367, "learning_rate": 1.6808163265306123e-05, "loss": 1.5329824686050415, "step": 882 }, { "epoch": 0.1766, "grad_norm": 20.165687561035156, "learning_rate": 1.680408163265306e-05, "loss": 0.6441766023635864, "step": 883 }, { "epoch": 0.1768, "grad_norm": 17.912281036376953, "learning_rate": 1.6800000000000002e-05, "loss": 0.4463423490524292, "step": 884 }, { "epoch": 0.177, "grad_norm": 20.251941680908203, "learning_rate": 1.679591836734694e-05, "loss": 0.8454023003578186, "step": 885 }, { "epoch": 0.1772, "grad_norm": 22.15752410888672, "learning_rate": 1.6791836734693877e-05, "loss": 0.892297089099884, "step": 886 }, { "epoch": 0.1774, "grad_norm": 10.843847274780273, "learning_rate": 1.6787755102040818e-05, "loss": 0.5362411141395569, "step": 887 }, { "epoch": 0.1776, "grad_norm": 4.842869758605957, "learning_rate": 1.6783673469387756e-05, "loss": 0.15559206902980804, "step": 888 }, { "epoch": 0.1778, "grad_norm": 8.601651191711426, "learning_rate": 1.6779591836734693e-05, "loss": 0.2904387414455414, "step": 889 }, { "epoch": 0.178, "grad_norm": 3.9437415599823, "learning_rate": 1.6775510204081634e-05, "loss": 0.04863588139414787, "step": 890 }, { "epoch": 0.1782, "grad_norm": 7.6431708335876465, "learning_rate": 1.6771428571428572e-05, "loss": 0.18728196620941162, "step": 891 }, { "epoch": 0.1784, "grad_norm": 10.038166046142578, "learning_rate": 1.676734693877551e-05, "loss": 0.2966366410255432, "step": 892 }, { "epoch": 0.1786, "grad_norm": 11.422334671020508, "learning_rate": 1.676326530612245e-05, "loss": 0.5634375214576721, "step": 893 }, { "epoch": 0.1788, "grad_norm": 11.061003684997559, "learning_rate": 1.6759183673469392e-05, "loss": 0.6100184321403503, "step": 894 }, { "epoch": 0.179, "grad_norm": 10.992979049682617, "learning_rate": 1.675510204081633e-05, "loss": 1.1781855821609497, "step": 895 }, { "epoch": 0.1792, "grad_norm": 13.457104682922363, "learning_rate": 1.6751020408163267e-05, "loss": 1.1264137029647827, "step": 896 }, { "epoch": 0.1794, "grad_norm": 7.429170608520508, "learning_rate": 1.6746938775510208e-05, "loss": 0.5296066403388977, "step": 897 }, { "epoch": 0.1796, "grad_norm": 12.779236793518066, "learning_rate": 1.6742857142857146e-05, "loss": 0.47797414660453796, "step": 898 }, { "epoch": 0.1798, "grad_norm": 12.133633613586426, "learning_rate": 1.6738775510204083e-05, "loss": 1.237507939338684, "step": 899 }, { "epoch": 0.18, "grad_norm": 14.93108081817627, "learning_rate": 1.673469387755102e-05, "loss": 1.5555381774902344, "step": 900 }, { "epoch": 0.1802, "grad_norm": 7.929037570953369, "learning_rate": 1.6730612244897962e-05, "loss": 0.27353933453559875, "step": 901 }, { "epoch": 0.1804, "grad_norm": 9.74111557006836, "learning_rate": 1.67265306122449e-05, "loss": 0.3268223702907562, "step": 902 }, { "epoch": 0.1806, "grad_norm": 12.810250282287598, "learning_rate": 1.6722448979591837e-05, "loss": 1.1125568151474, "step": 903 }, { "epoch": 0.1808, "grad_norm": 10.80146312713623, "learning_rate": 1.6718367346938778e-05, "loss": 0.8165103793144226, "step": 904 }, { "epoch": 0.181, "grad_norm": 12.98203182220459, "learning_rate": 1.6714285714285716e-05, "loss": 0.8082551956176758, "step": 905 }, { "epoch": 0.1812, "grad_norm": 15.881731033325195, "learning_rate": 1.6710204081632654e-05, "loss": 0.8954185843467712, "step": 906 }, { "epoch": 0.1814, "grad_norm": 7.476495742797852, "learning_rate": 1.6706122448979595e-05, "loss": 2.61548113822937, "step": 907 }, { "epoch": 0.1816, "grad_norm": 11.759521484375, "learning_rate": 1.6702040816326532e-05, "loss": 2.5834901332855225, "step": 908 }, { "epoch": 0.1818, "grad_norm": 17.707563400268555, "learning_rate": 1.669795918367347e-05, "loss": 0.46600961685180664, "step": 909 }, { "epoch": 0.182, "grad_norm": 17.35898208618164, "learning_rate": 1.669387755102041e-05, "loss": 0.5477342009544373, "step": 910 }, { "epoch": 0.1822, "grad_norm": 16.38886260986328, "learning_rate": 1.668979591836735e-05, "loss": 0.241845965385437, "step": 911 }, { "epoch": 0.1824, "grad_norm": 9.18626594543457, "learning_rate": 1.6685714285714286e-05, "loss": 0.3251483738422394, "step": 912 }, { "epoch": 0.1826, "grad_norm": 11.48183536529541, "learning_rate": 1.6681632653061227e-05, "loss": 1.201566457748413, "step": 913 }, { "epoch": 0.1828, "grad_norm": 12.863166809082031, "learning_rate": 1.6677551020408165e-05, "loss": 1.3929704427719116, "step": 914 }, { "epoch": 0.183, "grad_norm": 8.982046127319336, "learning_rate": 1.6673469387755102e-05, "loss": 0.30986925959587097, "step": 915 }, { "epoch": 0.1832, "grad_norm": 11.40433120727539, "learning_rate": 1.6669387755102044e-05, "loss": 0.4559279978275299, "step": 916 }, { "epoch": 0.1834, "grad_norm": 7.865514755249023, "learning_rate": 1.666530612244898e-05, "loss": 0.21401290595531464, "step": 917 }, { "epoch": 0.1836, "grad_norm": 5.85783052444458, "learning_rate": 1.666122448979592e-05, "loss": 0.058106567710638046, "step": 918 }, { "epoch": 0.1838, "grad_norm": 12.835536003112793, "learning_rate": 1.665714285714286e-05, "loss": 0.5007133483886719, "step": 919 }, { "epoch": 0.184, "grad_norm": 10.269946098327637, "learning_rate": 1.6653061224489797e-05, "loss": 1.027040958404541, "step": 920 }, { "epoch": 0.1842, "grad_norm": 28.42228889465332, "learning_rate": 1.6648979591836735e-05, "loss": 1.9055904150009155, "step": 921 }, { "epoch": 0.1844, "grad_norm": 30.98174476623535, "learning_rate": 1.6644897959183676e-05, "loss": 1.6907411813735962, "step": 922 }, { "epoch": 0.1846, "grad_norm": 11.129870414733887, "learning_rate": 1.6640816326530614e-05, "loss": 0.4462185800075531, "step": 923 }, { "epoch": 0.1848, "grad_norm": 9.945165634155273, "learning_rate": 1.663673469387755e-05, "loss": 0.38100337982177734, "step": 924 }, { "epoch": 0.185, "grad_norm": 9.017776489257812, "learning_rate": 1.6632653061224492e-05, "loss": 0.3078192174434662, "step": 925 }, { "epoch": 0.1852, "grad_norm": 7.9817118644714355, "learning_rate": 1.662857142857143e-05, "loss": 0.24498671293258667, "step": 926 }, { "epoch": 0.1854, "grad_norm": 10.647873878479004, "learning_rate": 1.6624489795918368e-05, "loss": 0.6621216535568237, "step": 927 }, { "epoch": 0.1856, "grad_norm": 8.087475776672363, "learning_rate": 1.6620408163265305e-05, "loss": 0.9449856877326965, "step": 928 }, { "epoch": 0.1858, "grad_norm": 9.353774070739746, "learning_rate": 1.6616326530612246e-05, "loss": 0.6571586728096008, "step": 929 }, { "epoch": 0.186, "grad_norm": 5.978039264678955, "learning_rate": 1.6612244897959184e-05, "loss": 0.3160820007324219, "step": 930 }, { "epoch": 0.1862, "grad_norm": 9.821951866149902, "learning_rate": 1.660816326530612e-05, "loss": 1.3220912218093872, "step": 931 }, { "epoch": 0.1864, "grad_norm": 16.91419792175293, "learning_rate": 1.6604081632653063e-05, "loss": 1.5503568649291992, "step": 932 }, { "epoch": 0.1866, "grad_norm": 10.032426834106445, "learning_rate": 1.66e-05, "loss": 0.3008374869823456, "step": 933 }, { "epoch": 0.1868, "grad_norm": 9.833456993103027, "learning_rate": 1.6595918367346938e-05, "loss": 0.3403104841709137, "step": 934 }, { "epoch": 0.187, "grad_norm": 10.148767471313477, "learning_rate": 1.659183673469388e-05, "loss": 0.35658565163612366, "step": 935 }, { "epoch": 0.1872, "grad_norm": 9.864377975463867, "learning_rate": 1.6587755102040817e-05, "loss": 0.3419613540172577, "step": 936 }, { "epoch": 0.1874, "grad_norm": 10.93486213684082, "learning_rate": 1.6583673469387754e-05, "loss": 0.3023720681667328, "step": 937 }, { "epoch": 0.1876, "grad_norm": 11.544412612915039, "learning_rate": 1.6579591836734695e-05, "loss": 0.3000563085079193, "step": 938 }, { "epoch": 0.1878, "grad_norm": 26.628463745117188, "learning_rate": 1.6575510204081633e-05, "loss": 1.5193748474121094, "step": 939 }, { "epoch": 0.188, "grad_norm": 34.582759857177734, "learning_rate": 1.6571428571428574e-05, "loss": 1.9608640670776367, "step": 940 }, { "epoch": 0.1882, "grad_norm": 12.25756549835205, "learning_rate": 1.656734693877551e-05, "loss": 0.35635626316070557, "step": 941 }, { "epoch": 0.1884, "grad_norm": 9.758035659790039, "learning_rate": 1.6563265306122453e-05, "loss": 0.5577664375305176, "step": 942 }, { "epoch": 0.1886, "grad_norm": 11.022090911865234, "learning_rate": 1.655918367346939e-05, "loss": 0.7006524205207825, "step": 943 }, { "epoch": 0.1888, "grad_norm": 5.026852607727051, "learning_rate": 1.6555102040816328e-05, "loss": 0.07087205350399017, "step": 944 }, { "epoch": 0.189, "grad_norm": 17.114988327026367, "learning_rate": 1.655102040816327e-05, "loss": 0.38376402854919434, "step": 945 }, { "epoch": 0.1892, "grad_norm": 7.769924640655518, "learning_rate": 1.6546938775510207e-05, "loss": 0.42324718832969666, "step": 946 }, { "epoch": 0.1894, "grad_norm": 6.787831783294678, "learning_rate": 1.6542857142857144e-05, "loss": 0.15425239503383636, "step": 947 }, { "epoch": 0.1896, "grad_norm": 9.386636734008789, "learning_rate": 1.6538775510204085e-05, "loss": 0.12651686370372772, "step": 948 }, { "epoch": 0.1898, "grad_norm": 9.607634544372559, "learning_rate": 1.6534693877551023e-05, "loss": 1.2049074172973633, "step": 949 }, { "epoch": 0.19, "grad_norm": 6.797369480133057, "learning_rate": 1.653061224489796e-05, "loss": 0.4870491027832031, "step": 950 }, { "epoch": 0.1902, "grad_norm": 11.430431365966797, "learning_rate": 1.65265306122449e-05, "loss": 0.5907344818115234, "step": 951 }, { "epoch": 0.1904, "grad_norm": 5.133027076721191, "learning_rate": 1.652244897959184e-05, "loss": 0.05749613419175148, "step": 952 }, { "epoch": 0.1906, "grad_norm": 12.587271690368652, "learning_rate": 1.6518367346938777e-05, "loss": 0.5050173997879028, "step": 953 }, { "epoch": 0.1908, "grad_norm": 8.202754974365234, "learning_rate": 1.6514285714285714e-05, "loss": 0.12933553755283356, "step": 954 }, { "epoch": 0.191, "grad_norm": 16.069805145263672, "learning_rate": 1.6510204081632655e-05, "loss": 0.9935396313667297, "step": 955 }, { "epoch": 0.1912, "grad_norm": 12.470759391784668, "learning_rate": 1.6506122448979593e-05, "loss": 0.9283061623573303, "step": 956 }, { "epoch": 0.1914, "grad_norm": 11.212996482849121, "learning_rate": 1.650204081632653e-05, "loss": 0.42528319358825684, "step": 957 }, { "epoch": 0.1916, "grad_norm": 11.121280670166016, "learning_rate": 1.6497959183673472e-05, "loss": 0.3700462281703949, "step": 958 }, { "epoch": 0.1918, "grad_norm": 7.70326042175293, "learning_rate": 1.649387755102041e-05, "loss": 0.29956695437431335, "step": 959 }, { "epoch": 0.192, "grad_norm": 7.19465970993042, "learning_rate": 1.6489795918367347e-05, "loss": 0.12585383653640747, "step": 960 }, { "epoch": 0.1922, "grad_norm": 9.511792182922363, "learning_rate": 1.6485714285714288e-05, "loss": 0.7968559265136719, "step": 961 }, { "epoch": 0.1924, "grad_norm": 9.611586570739746, "learning_rate": 1.6481632653061226e-05, "loss": 0.5876891016960144, "step": 962 }, { "epoch": 0.1926, "grad_norm": 13.071715354919434, "learning_rate": 1.6477551020408163e-05, "loss": 2.1680548191070557, "step": 963 }, { "epoch": 0.1928, "grad_norm": 13.33552074432373, "learning_rate": 1.6473469387755104e-05, "loss": 2.052441358566284, "step": 964 }, { "epoch": 0.193, "grad_norm": 6.807119369506836, "learning_rate": 1.6469387755102042e-05, "loss": 0.11891964823007584, "step": 965 }, { "epoch": 0.1932, "grad_norm": 7.7496867179870605, "learning_rate": 1.646530612244898e-05, "loss": 0.09799344092607498, "step": 966 }, { "epoch": 0.1934, "grad_norm": 7.558412551879883, "learning_rate": 1.646122448979592e-05, "loss": 3.6984379291534424, "step": 967 }, { "epoch": 0.1936, "grad_norm": 9.672758102416992, "learning_rate": 1.645714285714286e-05, "loss": 3.672480344772339, "step": 968 }, { "epoch": 0.1938, "grad_norm": 11.874825477600098, "learning_rate": 1.6453061224489796e-05, "loss": 1.5774186849594116, "step": 969 }, { "epoch": 0.194, "grad_norm": 9.94528865814209, "learning_rate": 1.6448979591836737e-05, "loss": 1.4908758401870728, "step": 970 }, { "epoch": 0.1942, "grad_norm": 8.902161598205566, "learning_rate": 1.6444897959183675e-05, "loss": 1.4055157899856567, "step": 971 }, { "epoch": 0.1944, "grad_norm": 8.480953216552734, "learning_rate": 1.6440816326530612e-05, "loss": 1.4322876930236816, "step": 972 }, { "epoch": 0.1946, "grad_norm": 8.42033576965332, "learning_rate": 1.6436734693877553e-05, "loss": 0.5924834609031677, "step": 973 }, { "epoch": 0.1948, "grad_norm": 9.280115127563477, "learning_rate": 1.643265306122449e-05, "loss": 1.2530088424682617, "step": 974 }, { "epoch": 0.195, "grad_norm": 11.592122077941895, "learning_rate": 1.642857142857143e-05, "loss": 2.4300496578216553, "step": 975 }, { "epoch": 0.1952, "grad_norm": 12.084485054016113, "learning_rate": 1.642448979591837e-05, "loss": 2.4004557132720947, "step": 976 }, { "epoch": 0.1954, "grad_norm": 15.528419494628906, "learning_rate": 1.6420408163265307e-05, "loss": 0.7741004824638367, "step": 977 }, { "epoch": 0.1956, "grad_norm": 21.244705200195312, "learning_rate": 1.6416326530612245e-05, "loss": 0.8380337357521057, "step": 978 }, { "epoch": 0.1958, "grad_norm": 45.149131774902344, "learning_rate": 1.6412244897959183e-05, "loss": 2.7999441623687744, "step": 979 }, { "epoch": 0.196, "grad_norm": 12.799947738647461, "learning_rate": 1.6408163265306124e-05, "loss": 2.601938486099243, "step": 980 }, { "epoch": 0.1962, "grad_norm": 12.487517356872559, "learning_rate": 1.640408163265306e-05, "loss": 0.842620313167572, "step": 981 }, { "epoch": 0.1964, "grad_norm": 11.241732597351074, "learning_rate": 1.64e-05, "loss": 1.266300082206726, "step": 982 }, { "epoch": 0.1966, "grad_norm": 13.937366485595703, "learning_rate": 1.639591836734694e-05, "loss": 0.48552432656288147, "step": 983 }, { "epoch": 0.1968, "grad_norm": 12.574949264526367, "learning_rate": 1.6391836734693878e-05, "loss": 0.39523372054100037, "step": 984 }, { "epoch": 0.197, "grad_norm": 12.430194854736328, "learning_rate": 1.638775510204082e-05, "loss": 0.41487932205200195, "step": 985 }, { "epoch": 0.1972, "grad_norm": 12.363018035888672, "learning_rate": 1.6383673469387756e-05, "loss": 0.5536958575248718, "step": 986 }, { "epoch": 0.1974, "grad_norm": 12.69167709350586, "learning_rate": 1.6379591836734697e-05, "loss": 0.532865583896637, "step": 987 }, { "epoch": 0.1976, "grad_norm": 12.416764259338379, "learning_rate": 1.6375510204081635e-05, "loss": 0.49868759512901306, "step": 988 }, { "epoch": 0.1978, "grad_norm": 21.368391036987305, "learning_rate": 1.6371428571428572e-05, "loss": 0.9303390979766846, "step": 989 }, { "epoch": 0.198, "grad_norm": 17.462474822998047, "learning_rate": 1.6367346938775513e-05, "loss": 0.6140914559364319, "step": 990 }, { "epoch": 0.1982, "grad_norm": 13.780851364135742, "learning_rate": 1.636326530612245e-05, "loss": 0.538284420967102, "step": 991 }, { "epoch": 0.1984, "grad_norm": 11.856263160705566, "learning_rate": 1.635918367346939e-05, "loss": 0.3810892403125763, "step": 992 }, { "epoch": 0.1986, "grad_norm": 13.552824974060059, "learning_rate": 1.635510204081633e-05, "loss": 1.638392448425293, "step": 993 }, { "epoch": 0.1988, "grad_norm": 13.62729263305664, "learning_rate": 1.6351020408163267e-05, "loss": 1.083305835723877, "step": 994 }, { "epoch": 0.199, "grad_norm": 13.757994651794434, "learning_rate": 1.6346938775510205e-05, "loss": 0.7081871032714844, "step": 995 }, { "epoch": 0.1992, "grad_norm": 11.198212623596191, "learning_rate": 1.6342857142857146e-05, "loss": 0.4434303343296051, "step": 996 }, { "epoch": 0.1994, "grad_norm": 9.911272048950195, "learning_rate": 1.6338775510204084e-05, "loss": 0.4623580276966095, "step": 997 }, { "epoch": 0.1996, "grad_norm": 6.290027618408203, "learning_rate": 1.633469387755102e-05, "loss": 0.2070257067680359, "step": 998 }, { "epoch": 0.1998, "grad_norm": 8.490586280822754, "learning_rate": 1.6330612244897962e-05, "loss": 0.41782307624816895, "step": 999 }, { "epoch": 0.2, "grad_norm": 6.666543006896973, "learning_rate": 1.63265306122449e-05, "loss": 0.0420907698571682, "step": 1000 }, { "epoch": 0.2002, "grad_norm": 7.6608710289001465, "learning_rate": 1.6322448979591838e-05, "loss": 0.16788025200366974, "step": 1001 }, { "epoch": 0.2004, "grad_norm": 10.02257251739502, "learning_rate": 1.631836734693878e-05, "loss": 0.19406701624393463, "step": 1002 }, { "epoch": 0.2006, "grad_norm": 9.231233596801758, "learning_rate": 1.6314285714285716e-05, "loss": 0.34114423394203186, "step": 1003 }, { "epoch": 0.2008, "grad_norm": 8.61544418334961, "learning_rate": 1.6310204081632654e-05, "loss": 0.36107227206230164, "step": 1004 }, { "epoch": 0.201, "grad_norm": 12.734411239624023, "learning_rate": 1.630612244897959e-05, "loss": 0.4258911609649658, "step": 1005 }, { "epoch": 0.2012, "grad_norm": 8.247803688049316, "learning_rate": 1.6302040816326533e-05, "loss": 0.5811495780944824, "step": 1006 }, { "epoch": 0.2014, "grad_norm": 12.795159339904785, "learning_rate": 1.629795918367347e-05, "loss": 0.45137616991996765, "step": 1007 }, { "epoch": 0.2016, "grad_norm": 11.190593719482422, "learning_rate": 1.6293877551020408e-05, "loss": 0.4331296384334564, "step": 1008 }, { "epoch": 0.2018, "grad_norm": 6.39260196685791, "learning_rate": 1.628979591836735e-05, "loss": 0.23958873748779297, "step": 1009 }, { "epoch": 0.202, "grad_norm": 8.173701286315918, "learning_rate": 1.6285714285714287e-05, "loss": 0.301384836435318, "step": 1010 }, { "epoch": 0.2022, "grad_norm": 6.8658552169799805, "learning_rate": 1.6281632653061224e-05, "loss": 2.3241984844207764, "step": 1011 }, { "epoch": 0.2024, "grad_norm": 8.660361289978027, "learning_rate": 1.6277551020408165e-05, "loss": 2.2831337451934814, "step": 1012 }, { "epoch": 0.2026, "grad_norm": 8.958223342895508, "learning_rate": 1.6273469387755103e-05, "loss": 0.4590326249599457, "step": 1013 }, { "epoch": 0.2028, "grad_norm": 5.628757476806641, "learning_rate": 1.626938775510204e-05, "loss": 0.2605743706226349, "step": 1014 }, { "epoch": 0.203, "grad_norm": 11.452364921569824, "learning_rate": 1.626530612244898e-05, "loss": 0.4239030182361603, "step": 1015 }, { "epoch": 0.2032, "grad_norm": 9.73918342590332, "learning_rate": 1.626122448979592e-05, "loss": 0.2536905109882355, "step": 1016 }, { "epoch": 0.2034, "grad_norm": 10.82717514038086, "learning_rate": 1.6257142857142857e-05, "loss": 3.9543802738189697, "step": 1017 }, { "epoch": 0.2036, "grad_norm": 8.782698631286621, "learning_rate": 1.6253061224489798e-05, "loss": 3.950533628463745, "step": 1018 }, { "epoch": 0.2038, "grad_norm": 12.06242847442627, "learning_rate": 1.6248979591836736e-05, "loss": 0.34714195132255554, "step": 1019 }, { "epoch": 0.204, "grad_norm": 10.154047966003418, "learning_rate": 1.6244897959183673e-05, "loss": 0.4075103998184204, "step": 1020 }, { "epoch": 0.2042, "grad_norm": 16.931293487548828, "learning_rate": 1.6240816326530614e-05, "loss": 0.6289825439453125, "step": 1021 }, { "epoch": 0.2044, "grad_norm": 14.441441535949707, "learning_rate": 1.6236734693877552e-05, "loss": 0.46771684288978577, "step": 1022 }, { "epoch": 0.2046, "grad_norm": 16.51957130432129, "learning_rate": 1.623265306122449e-05, "loss": 2.189422845840454, "step": 1023 }, { "epoch": 0.2048, "grad_norm": 16.4713077545166, "learning_rate": 1.622857142857143e-05, "loss": 2.1738336086273193, "step": 1024 }, { "epoch": 0.205, "grad_norm": 18.388099670410156, "learning_rate": 1.6224489795918368e-05, "loss": 0.933708131313324, "step": 1025 }, { "epoch": 0.2052, "grad_norm": 24.324893951416016, "learning_rate": 1.6220408163265306e-05, "loss": 1.1171584129333496, "step": 1026 }, { "epoch": 0.2054, "grad_norm": 9.67011833190918, "learning_rate": 1.6216326530612247e-05, "loss": 0.8000146746635437, "step": 1027 }, { "epoch": 0.2056, "grad_norm": 11.25383472442627, "learning_rate": 1.6212244897959184e-05, "loss": 0.6505023837089539, "step": 1028 }, { "epoch": 0.2058, "grad_norm": 9.337218284606934, "learning_rate": 1.6208163265306122e-05, "loss": 1.2439852952957153, "step": 1029 }, { "epoch": 0.206, "grad_norm": 11.8048677444458, "learning_rate": 1.6204081632653063e-05, "loss": 1.338663935661316, "step": 1030 }, { "epoch": 0.2062, "grad_norm": 10.880928993225098, "learning_rate": 1.62e-05, "loss": 0.3804416358470917, "step": 1031 }, { "epoch": 0.2064, "grad_norm": 6.2770514488220215, "learning_rate": 1.6195918367346942e-05, "loss": 0.5007272362709045, "step": 1032 }, { "epoch": 0.2066, "grad_norm": 12.41136646270752, "learning_rate": 1.619183673469388e-05, "loss": 0.5273497700691223, "step": 1033 }, { "epoch": 0.2068, "grad_norm": 11.801478385925293, "learning_rate": 1.6187755102040817e-05, "loss": 0.3947758674621582, "step": 1034 }, { "epoch": 0.207, "grad_norm": 11.118836402893066, "learning_rate": 1.6183673469387758e-05, "loss": 0.8456204533576965, "step": 1035 }, { "epoch": 0.2072, "grad_norm": 8.68510627746582, "learning_rate": 1.6179591836734696e-05, "loss": 0.9995532035827637, "step": 1036 }, { "epoch": 0.2074, "grad_norm": 9.32492733001709, "learning_rate": 1.6175510204081633e-05, "loss": 0.3073303997516632, "step": 1037 }, { "epoch": 0.2076, "grad_norm": 9.923360824584961, "learning_rate": 1.6171428571428574e-05, "loss": 0.24806194007396698, "step": 1038 }, { "epoch": 0.2078, "grad_norm": 8.502784729003906, "learning_rate": 1.6167346938775512e-05, "loss": 0.3961394131183624, "step": 1039 }, { "epoch": 0.208, "grad_norm": 3.021148204803467, "learning_rate": 1.616326530612245e-05, "loss": 0.023742148652672768, "step": 1040 }, { "epoch": 0.2082, "grad_norm": 6.566261291503906, "learning_rate": 1.615918367346939e-05, "loss": 0.2751035988330841, "step": 1041 }, { "epoch": 0.2084, "grad_norm": 4.9713029861450195, "learning_rate": 1.615510204081633e-05, "loss": 0.21981175243854523, "step": 1042 }, { "epoch": 0.2086, "grad_norm": 8.515305519104004, "learning_rate": 1.6151020408163266e-05, "loss": 0.28081637620925903, "step": 1043 }, { "epoch": 0.2088, "grad_norm": 8.78039836883545, "learning_rate": 1.6146938775510207e-05, "loss": 0.27198779582977295, "step": 1044 }, { "epoch": 0.209, "grad_norm": 9.654149055480957, "learning_rate": 1.6142857142857145e-05, "loss": 0.6349632143974304, "step": 1045 }, { "epoch": 0.2092, "grad_norm": 8.170354843139648, "learning_rate": 1.6138775510204082e-05, "loss": 0.8125829696655273, "step": 1046 }, { "epoch": 0.2094, "grad_norm": 7.464170932769775, "learning_rate": 1.6134693877551023e-05, "loss": 0.307439386844635, "step": 1047 }, { "epoch": 0.2096, "grad_norm": 10.438311576843262, "learning_rate": 1.613061224489796e-05, "loss": 0.36502647399902344, "step": 1048 }, { "epoch": 0.2098, "grad_norm": 12.222745895385742, "learning_rate": 1.61265306122449e-05, "loss": 0.4203064739704132, "step": 1049 }, { "epoch": 0.21, "grad_norm": 4.82296895980835, "learning_rate": 1.612244897959184e-05, "loss": 0.11699660867452621, "step": 1050 }, { "epoch": 0.2102, "grad_norm": 9.689432144165039, "learning_rate": 1.6118367346938777e-05, "loss": 0.4499718248844147, "step": 1051 }, { "epoch": 0.2104, "grad_norm": 3.983093500137329, "learning_rate": 1.6114285714285715e-05, "loss": 0.08581101894378662, "step": 1052 }, { "epoch": 0.2106, "grad_norm": 8.218273162841797, "learning_rate": 1.6110204081632656e-05, "loss": 1.0330334901809692, "step": 1053 }, { "epoch": 0.2108, "grad_norm": 8.369105339050293, "learning_rate": 1.6106122448979594e-05, "loss": 0.7431716918945312, "step": 1054 }, { "epoch": 0.211, "grad_norm": 6.2383832931518555, "learning_rate": 1.610204081632653e-05, "loss": 0.27529022097587585, "step": 1055 }, { "epoch": 0.2112, "grad_norm": 7.661225318908691, "learning_rate": 1.6097959183673472e-05, "loss": 0.3052869439125061, "step": 1056 }, { "epoch": 0.2114, "grad_norm": 14.939837455749512, "learning_rate": 1.609387755102041e-05, "loss": 0.5405413508415222, "step": 1057 }, { "epoch": 0.2116, "grad_norm": 16.573583602905273, "learning_rate": 1.6089795918367347e-05, "loss": 0.6392536759376526, "step": 1058 }, { "epoch": 0.2118, "grad_norm": 15.064249038696289, "learning_rate": 1.6085714285714285e-05, "loss": 2.238734006881714, "step": 1059 }, { "epoch": 0.212, "grad_norm": 16.13454818725586, "learning_rate": 1.6081632653061226e-05, "loss": 2.2306811809539795, "step": 1060 }, { "epoch": 0.2122, "grad_norm": 11.392303466796875, "learning_rate": 1.6077551020408164e-05, "loss": 0.4538653790950775, "step": 1061 }, { "epoch": 0.2124, "grad_norm": 19.961862564086914, "learning_rate": 1.60734693877551e-05, "loss": 0.38527926802635193, "step": 1062 }, { "epoch": 0.2126, "grad_norm": 9.796698570251465, "learning_rate": 1.6069387755102042e-05, "loss": 0.3802415132522583, "step": 1063 }, { "epoch": 0.2128, "grad_norm": 11.403270721435547, "learning_rate": 1.606530612244898e-05, "loss": 0.3938158452510834, "step": 1064 }, { "epoch": 0.213, "grad_norm": 8.493871688842773, "learning_rate": 1.6061224489795918e-05, "loss": 0.556251585483551, "step": 1065 }, { "epoch": 0.2132, "grad_norm": 5.4968156814575195, "learning_rate": 1.605714285714286e-05, "loss": 0.21873192489147186, "step": 1066 }, { "epoch": 0.2134, "grad_norm": 9.716056823730469, "learning_rate": 1.6053061224489796e-05, "loss": 0.5703141689300537, "step": 1067 }, { "epoch": 0.2136, "grad_norm": 6.592094898223877, "learning_rate": 1.6048979591836734e-05, "loss": 0.16273975372314453, "step": 1068 }, { "epoch": 0.2138, "grad_norm": 11.018513679504395, "learning_rate": 1.6044897959183675e-05, "loss": 0.3084613084793091, "step": 1069 }, { "epoch": 0.214, "grad_norm": 5.117728233337402, "learning_rate": 1.6040816326530613e-05, "loss": 0.3241164982318878, "step": 1070 }, { "epoch": 0.2142, "grad_norm": 9.158685684204102, "learning_rate": 1.603673469387755e-05, "loss": 1.0577012300491333, "step": 1071 }, { "epoch": 0.2144, "grad_norm": 10.653162002563477, "learning_rate": 1.603265306122449e-05, "loss": 0.5433899760246277, "step": 1072 }, { "epoch": 0.2146, "grad_norm": 19.387746810913086, "learning_rate": 1.602857142857143e-05, "loss": 1.367659568786621, "step": 1073 }, { "epoch": 0.2148, "grad_norm": 16.123849868774414, "learning_rate": 1.6024489795918367e-05, "loss": 1.470677375793457, "step": 1074 }, { "epoch": 0.215, "grad_norm": 12.336935997009277, "learning_rate": 1.6020408163265308e-05, "loss": 1.355057716369629, "step": 1075 }, { "epoch": 0.2152, "grad_norm": 15.715835571289062, "learning_rate": 1.601632653061225e-05, "loss": 0.7269086837768555, "step": 1076 }, { "epoch": 0.2154, "grad_norm": 9.702445983886719, "learning_rate": 1.6012244897959186e-05, "loss": 0.7072898745536804, "step": 1077 }, { "epoch": 0.2156, "grad_norm": 10.954768180847168, "learning_rate": 1.6008163265306124e-05, "loss": 0.5804625749588013, "step": 1078 }, { "epoch": 0.2158, "grad_norm": 7.3774027824401855, "learning_rate": 1.6004081632653065e-05, "loss": 0.6128941774368286, "step": 1079 }, { "epoch": 0.216, "grad_norm": 9.81145191192627, "learning_rate": 1.6000000000000003e-05, "loss": 0.5012831687927246, "step": 1080 }, { "epoch": 0.2162, "grad_norm": 17.655073165893555, "learning_rate": 1.599591836734694e-05, "loss": 0.9120580554008484, "step": 1081 }, { "epoch": 0.2164, "grad_norm": 17.177391052246094, "learning_rate": 1.599183673469388e-05, "loss": 0.9013988375663757, "step": 1082 }, { "epoch": 0.2166, "grad_norm": 9.5853853225708, "learning_rate": 1.598775510204082e-05, "loss": 0.22319452464580536, "step": 1083 }, { "epoch": 0.2168, "grad_norm": 7.035661220550537, "learning_rate": 1.5983673469387757e-05, "loss": 0.13573120534420013, "step": 1084 }, { "epoch": 0.217, "grad_norm": 10.84890079498291, "learning_rate": 1.5979591836734694e-05, "loss": 0.9032676815986633, "step": 1085 }, { "epoch": 0.2172, "grad_norm": 15.105993270874023, "learning_rate": 1.5975510204081635e-05, "loss": 0.975628137588501, "step": 1086 }, { "epoch": 0.2174, "grad_norm": 10.471083641052246, "learning_rate": 1.5971428571428573e-05, "loss": 0.46671006083488464, "step": 1087 }, { "epoch": 0.2176, "grad_norm": 9.520712852478027, "learning_rate": 1.596734693877551e-05, "loss": 0.31040897965431213, "step": 1088 }, { "epoch": 0.2178, "grad_norm": 10.263479232788086, "learning_rate": 1.596326530612245e-05, "loss": 0.5954416394233704, "step": 1089 }, { "epoch": 0.218, "grad_norm": 6.333590030670166, "learning_rate": 1.595918367346939e-05, "loss": 0.28529664874076843, "step": 1090 }, { "epoch": 0.2182, "grad_norm": 9.763040542602539, "learning_rate": 1.5955102040816327e-05, "loss": 1.5908406972885132, "step": 1091 }, { "epoch": 0.2184, "grad_norm": 12.306046485900879, "learning_rate": 1.5951020408163268e-05, "loss": 0.5967962741851807, "step": 1092 }, { "epoch": 0.2186, "grad_norm": 22.245534896850586, "learning_rate": 1.5946938775510206e-05, "loss": 2.2511146068573, "step": 1093 }, { "epoch": 0.2188, "grad_norm": 27.319904327392578, "learning_rate": 1.5942857142857143e-05, "loss": 2.5454025268554688, "step": 1094 }, { "epoch": 0.219, "grad_norm": 8.289361000061035, "learning_rate": 1.5938775510204084e-05, "loss": 1.4484649896621704, "step": 1095 }, { "epoch": 0.2192, "grad_norm": 12.739070892333984, "learning_rate": 1.5934693877551022e-05, "loss": 1.9173656702041626, "step": 1096 }, { "epoch": 0.2194, "grad_norm": 8.818769454956055, "learning_rate": 1.593061224489796e-05, "loss": 0.3959437608718872, "step": 1097 }, { "epoch": 0.2196, "grad_norm": 8.04629898071289, "learning_rate": 1.59265306122449e-05, "loss": 0.5050313472747803, "step": 1098 }, { "epoch": 0.2198, "grad_norm": 9.941852569580078, "learning_rate": 1.5922448979591838e-05, "loss": 0.6239913105964661, "step": 1099 }, { "epoch": 0.22, "grad_norm": 6.9901957511901855, "learning_rate": 1.5918367346938776e-05, "loss": 0.2791489362716675, "step": 1100 }, { "epoch": 0.2202, "grad_norm": 14.052346229553223, "learning_rate": 1.5914285714285717e-05, "loss": 0.6275435090065002, "step": 1101 }, { "epoch": 0.2204, "grad_norm": 14.876299858093262, "learning_rate": 1.5910204081632654e-05, "loss": 0.8954221606254578, "step": 1102 }, { "epoch": 0.2206, "grad_norm": 8.319660186767578, "learning_rate": 1.5906122448979592e-05, "loss": 0.38297703862190247, "step": 1103 }, { "epoch": 0.2208, "grad_norm": 11.229364395141602, "learning_rate": 1.5902040816326533e-05, "loss": 0.4629489481449127, "step": 1104 }, { "epoch": 0.221, "grad_norm": 6.867579460144043, "learning_rate": 1.589795918367347e-05, "loss": 0.2175288051366806, "step": 1105 }, { "epoch": 0.2212, "grad_norm": 4.158148765563965, "learning_rate": 1.589387755102041e-05, "loss": 0.057790834456682205, "step": 1106 }, { "epoch": 0.2214, "grad_norm": 12.21056842803955, "learning_rate": 1.588979591836735e-05, "loss": 0.4217188358306885, "step": 1107 }, { "epoch": 0.2216, "grad_norm": 8.648012161254883, "learning_rate": 1.5885714285714287e-05, "loss": 0.3710355758666992, "step": 1108 }, { "epoch": 0.2218, "grad_norm": 10.563362121582031, "learning_rate": 1.5881632653061225e-05, "loss": 0.6589060425758362, "step": 1109 }, { "epoch": 0.222, "grad_norm": 10.837821006774902, "learning_rate": 1.5877551020408162e-05, "loss": 0.6284217238426208, "step": 1110 }, { "epoch": 0.2222, "grad_norm": 10.624652862548828, "learning_rate": 1.5873469387755103e-05, "loss": 0.6576958298683167, "step": 1111 }, { "epoch": 0.2224, "grad_norm": 9.058115005493164, "learning_rate": 1.586938775510204e-05, "loss": 1.0441205501556396, "step": 1112 }, { "epoch": 0.2226, "grad_norm": 8.694955825805664, "learning_rate": 1.586530612244898e-05, "loss": 0.4351350963115692, "step": 1113 }, { "epoch": 0.2228, "grad_norm": 12.053570747375488, "learning_rate": 1.586122448979592e-05, "loss": 0.7542349696159363, "step": 1114 }, { "epoch": 0.223, "grad_norm": 8.173776626586914, "learning_rate": 1.5857142857142857e-05, "loss": 0.5358859896659851, "step": 1115 }, { "epoch": 0.2232, "grad_norm": 7.0457377433776855, "learning_rate": 1.5853061224489795e-05, "loss": 0.3554631173610687, "step": 1116 }, { "epoch": 0.2234, "grad_norm": 7.606454849243164, "learning_rate": 1.5848979591836736e-05, "loss": 0.19667239487171173, "step": 1117 }, { "epoch": 0.2236, "grad_norm": 5.035172462463379, "learning_rate": 1.5844897959183674e-05, "loss": 0.1307631880044937, "step": 1118 }, { "epoch": 0.2238, "grad_norm": 13.789462089538574, "learning_rate": 1.584081632653061e-05, "loss": 0.5289885997772217, "step": 1119 }, { "epoch": 0.224, "grad_norm": 11.246469497680664, "learning_rate": 1.5836734693877552e-05, "loss": 0.7898580431938171, "step": 1120 }, { "epoch": 0.2242, "grad_norm": 11.977219581604004, "learning_rate": 1.5832653061224493e-05, "loss": 0.5384518504142761, "step": 1121 }, { "epoch": 0.2244, "grad_norm": 11.0541353225708, "learning_rate": 1.582857142857143e-05, "loss": 0.3177433907985687, "step": 1122 }, { "epoch": 0.2246, "grad_norm": 11.144501686096191, "learning_rate": 1.582448979591837e-05, "loss": 0.7216606736183167, "step": 1123 }, { "epoch": 0.2248, "grad_norm": 9.658855438232422, "learning_rate": 1.582040816326531e-05, "loss": 0.5789176821708679, "step": 1124 }, { "epoch": 0.225, "grad_norm": 8.09659481048584, "learning_rate": 1.5816326530612247e-05, "loss": 0.3930445611476898, "step": 1125 }, { "epoch": 0.2252, "grad_norm": 7.9985575675964355, "learning_rate": 1.5812244897959185e-05, "loss": 0.2690459191799164, "step": 1126 }, { "epoch": 0.2254, "grad_norm": 7.55100154876709, "learning_rate": 1.5808163265306126e-05, "loss": 0.2630295753479004, "step": 1127 }, { "epoch": 0.2256, "grad_norm": 3.8033721446990967, "learning_rate": 1.5804081632653064e-05, "loss": 0.08478239923715591, "step": 1128 }, { "epoch": 0.2258, "grad_norm": 9.037246704101562, "learning_rate": 1.58e-05, "loss": 0.456184983253479, "step": 1129 }, { "epoch": 0.226, "grad_norm": 9.843621253967285, "learning_rate": 1.5795918367346942e-05, "loss": 0.5759654641151428, "step": 1130 }, { "epoch": 0.2262, "grad_norm": 11.10017204284668, "learning_rate": 1.579183673469388e-05, "loss": 0.5551418662071228, "step": 1131 }, { "epoch": 0.2264, "grad_norm": 12.201043128967285, "learning_rate": 1.5787755102040817e-05, "loss": 0.3533940315246582, "step": 1132 }, { "epoch": 0.2266, "grad_norm": 11.207563400268555, "learning_rate": 1.578367346938776e-05, "loss": 0.23960542678833008, "step": 1133 }, { "epoch": 0.2268, "grad_norm": 11.763862609863281, "learning_rate": 1.5779591836734696e-05, "loss": 0.24775488674640656, "step": 1134 }, { "epoch": 0.227, "grad_norm": 23.140377044677734, "learning_rate": 1.5775510204081634e-05, "loss": 0.8969824910163879, "step": 1135 }, { "epoch": 0.2272, "grad_norm": 19.724388122558594, "learning_rate": 1.577142857142857e-05, "loss": 0.5984858870506287, "step": 1136 }, { "epoch": 0.2274, "grad_norm": 8.402267456054688, "learning_rate": 1.5767346938775512e-05, "loss": 0.30400899052619934, "step": 1137 }, { "epoch": 0.2276, "grad_norm": 2.2674312591552734, "learning_rate": 1.576326530612245e-05, "loss": 0.02049305848777294, "step": 1138 }, { "epoch": 0.2278, "grad_norm": 9.017812728881836, "learning_rate": 1.5759183673469388e-05, "loss": 0.2635888159275055, "step": 1139 }, { "epoch": 0.228, "grad_norm": 7.476223468780518, "learning_rate": 1.575510204081633e-05, "loss": 0.19383282959461212, "step": 1140 }, { "epoch": 0.2282, "grad_norm": 8.226303100585938, "learning_rate": 1.5751020408163266e-05, "loss": 0.4954911768436432, "step": 1141 }, { "epoch": 0.2284, "grad_norm": 7.242204666137695, "learning_rate": 1.5746938775510204e-05, "loss": 0.2757532298564911, "step": 1142 }, { "epoch": 0.2286, "grad_norm": 8.157430648803711, "learning_rate": 1.5742857142857145e-05, "loss": 0.29086917638778687, "step": 1143 }, { "epoch": 0.2288, "grad_norm": 2.8150479793548584, "learning_rate": 1.5738775510204083e-05, "loss": 0.027883892878890038, "step": 1144 }, { "epoch": 0.229, "grad_norm": 8.284408569335938, "learning_rate": 1.573469387755102e-05, "loss": 0.8409969806671143, "step": 1145 }, { "epoch": 0.2292, "grad_norm": 10.060462951660156, "learning_rate": 1.573061224489796e-05, "loss": 0.4277164936065674, "step": 1146 }, { "epoch": 0.2294, "grad_norm": 10.635279655456543, "learning_rate": 1.57265306122449e-05, "loss": 0.8976736068725586, "step": 1147 }, { "epoch": 0.2296, "grad_norm": 9.420601844787598, "learning_rate": 1.5722448979591837e-05, "loss": 0.37881675362586975, "step": 1148 }, { "epoch": 0.2298, "grad_norm": 11.448286056518555, "learning_rate": 1.5718367346938778e-05, "loss": 0.5376109480857849, "step": 1149 }, { "epoch": 0.23, "grad_norm": 5.324295997619629, "learning_rate": 1.5714285714285715e-05, "loss": 0.07132100313901901, "step": 1150 }, { "epoch": 0.2302, "grad_norm": 9.235241889953613, "learning_rate": 1.5710204081632653e-05, "loss": 1.1396642923355103, "step": 1151 }, { "epoch": 0.2304, "grad_norm": 8.762242317199707, "learning_rate": 1.5706122448979594e-05, "loss": 0.934231698513031, "step": 1152 }, { "epoch": 0.2306, "grad_norm": 5.705596446990967, "learning_rate": 1.570204081632653e-05, "loss": 0.082107312977314, "step": 1153 }, { "epoch": 0.2308, "grad_norm": 6.186675071716309, "learning_rate": 1.569795918367347e-05, "loss": 0.1751657873392105, "step": 1154 }, { "epoch": 0.231, "grad_norm": 8.288212776184082, "learning_rate": 1.569387755102041e-05, "loss": 0.42954421043395996, "step": 1155 }, { "epoch": 0.2312, "grad_norm": 2.3689615726470947, "learning_rate": 1.5689795918367348e-05, "loss": 0.018760303035378456, "step": 1156 }, { "epoch": 0.2314, "grad_norm": 10.67268180847168, "learning_rate": 1.5685714285714286e-05, "loss": 0.6322822570800781, "step": 1157 }, { "epoch": 0.2316, "grad_norm": 3.1626954078674316, "learning_rate": 1.5681632653061227e-05, "loss": 0.058579444885253906, "step": 1158 }, { "epoch": 0.2318, "grad_norm": 25.20267677307129, "learning_rate": 1.5677551020408164e-05, "loss": 0.627945065498352, "step": 1159 }, { "epoch": 0.232, "grad_norm": 17.494667053222656, "learning_rate": 1.5673469387755102e-05, "loss": 0.5639864206314087, "step": 1160 }, { "epoch": 0.2322, "grad_norm": 9.18785572052002, "learning_rate": 1.5669387755102043e-05, "loss": 0.9669410586357117, "step": 1161 }, { "epoch": 0.2324, "grad_norm": 8.705360412597656, "learning_rate": 1.566530612244898e-05, "loss": 0.8179224133491516, "step": 1162 }, { "epoch": 0.2326, "grad_norm": 9.248069763183594, "learning_rate": 1.5661224489795918e-05, "loss": 0.8471116423606873, "step": 1163 }, { "epoch": 0.2328, "grad_norm": 7.668497562408447, "learning_rate": 1.5657142857142856e-05, "loss": 0.7964031100273132, "step": 1164 }, { "epoch": 0.233, "grad_norm": 12.119585990905762, "learning_rate": 1.5653061224489797e-05, "loss": 0.35344669222831726, "step": 1165 }, { "epoch": 0.2332, "grad_norm": 9.378921508789062, "learning_rate": 1.5648979591836734e-05, "loss": 0.16154851019382477, "step": 1166 }, { "epoch": 0.2334, "grad_norm": 7.75394868850708, "learning_rate": 1.5644897959183676e-05, "loss": 0.395351380109787, "step": 1167 }, { "epoch": 0.2336, "grad_norm": 3.9985170364379883, "learning_rate": 1.5640816326530613e-05, "loss": 0.030814791098237038, "step": 1168 }, { "epoch": 0.2338, "grad_norm": 10.399243354797363, "learning_rate": 1.5636734693877554e-05, "loss": 0.5034558773040771, "step": 1169 }, { "epoch": 0.234, "grad_norm": 11.97280502319336, "learning_rate": 1.5632653061224492e-05, "loss": 0.34756746888160706, "step": 1170 }, { "epoch": 0.2342, "grad_norm": 9.808802604675293, "learning_rate": 1.562857142857143e-05, "loss": 1.1920939683914185, "step": 1171 }, { "epoch": 0.2344, "grad_norm": 9.306438446044922, "learning_rate": 1.562448979591837e-05, "loss": 0.9638805389404297, "step": 1172 }, { "epoch": 0.2346, "grad_norm": 9.191034317016602, "learning_rate": 1.5620408163265308e-05, "loss": 1.107985258102417, "step": 1173 }, { "epoch": 0.2348, "grad_norm": 9.289813995361328, "learning_rate": 1.5616326530612246e-05, "loss": 0.901727020740509, "step": 1174 }, { "epoch": 0.235, "grad_norm": 8.056550025939941, "learning_rate": 1.5612244897959187e-05, "loss": 1.07204270362854, "step": 1175 }, { "epoch": 0.2352, "grad_norm": 8.49542236328125, "learning_rate": 1.5608163265306124e-05, "loss": 0.7138166427612305, "step": 1176 }, { "epoch": 0.2354, "grad_norm": 7.110629081726074, "learning_rate": 1.5604081632653062e-05, "loss": 0.16482390463352203, "step": 1177 }, { "epoch": 0.2356, "grad_norm": 6.91373348236084, "learning_rate": 1.5600000000000003e-05, "loss": 0.1108563244342804, "step": 1178 }, { "epoch": 0.2358, "grad_norm": 20.4930362701416, "learning_rate": 1.559591836734694e-05, "loss": 0.7573724389076233, "step": 1179 }, { "epoch": 0.236, "grad_norm": 19.89785385131836, "learning_rate": 1.559183673469388e-05, "loss": 0.7158824801445007, "step": 1180 }, { "epoch": 0.2362, "grad_norm": 8.862764358520508, "learning_rate": 1.558775510204082e-05, "loss": 0.5127131342887878, "step": 1181 }, { "epoch": 0.2364, "grad_norm": 10.195377349853516, "learning_rate": 1.5583673469387757e-05, "loss": 0.5104061365127563, "step": 1182 }, { "epoch": 0.2366, "grad_norm": 18.68464469909668, "learning_rate": 1.5579591836734695e-05, "loss": 0.965368926525116, "step": 1183 }, { "epoch": 0.2368, "grad_norm": 13.724676132202148, "learning_rate": 1.5575510204081636e-05, "loss": 0.9805054664611816, "step": 1184 }, { "epoch": 0.237, "grad_norm": 11.423314094543457, "learning_rate": 1.5571428571428573e-05, "loss": 0.3765859603881836, "step": 1185 }, { "epoch": 0.2372, "grad_norm": 11.845845222473145, "learning_rate": 1.556734693877551e-05, "loss": 0.29967060685157776, "step": 1186 }, { "epoch": 0.2374, "grad_norm": 10.537186622619629, "learning_rate": 1.5563265306122452e-05, "loss": 0.39351996779441833, "step": 1187 }, { "epoch": 0.2376, "grad_norm": 8.71883487701416, "learning_rate": 1.555918367346939e-05, "loss": 0.4899787902832031, "step": 1188 }, { "epoch": 0.2378, "grad_norm": 6.350547790527344, "learning_rate": 1.5555102040816327e-05, "loss": 0.1344643384218216, "step": 1189 }, { "epoch": 0.238, "grad_norm": 7.202303886413574, "learning_rate": 1.5551020408163265e-05, "loss": 0.1689714789390564, "step": 1190 }, { "epoch": 0.2382, "grad_norm": 12.866501808166504, "learning_rate": 1.5546938775510206e-05, "loss": 0.18987785279750824, "step": 1191 }, { "epoch": 0.2384, "grad_norm": 9.913227081298828, "learning_rate": 1.5542857142857144e-05, "loss": 0.24315452575683594, "step": 1192 }, { "epoch": 0.2386, "grad_norm": 9.650484085083008, "learning_rate": 1.553877551020408e-05, "loss": 0.5700632333755493, "step": 1193 }, { "epoch": 0.2388, "grad_norm": 4.102683067321777, "learning_rate": 1.5534693877551022e-05, "loss": 0.0377318412065506, "step": 1194 }, { "epoch": 0.239, "grad_norm": 21.682559967041016, "learning_rate": 1.553061224489796e-05, "loss": 1.3413629531860352, "step": 1195 }, { "epoch": 0.2392, "grad_norm": 31.129104614257812, "learning_rate": 1.5526530612244898e-05, "loss": 1.8052949905395508, "step": 1196 }, { "epoch": 0.2394, "grad_norm": 20.076871871948242, "learning_rate": 1.552244897959184e-05, "loss": 0.6911614537239075, "step": 1197 }, { "epoch": 0.2396, "grad_norm": 20.200077056884766, "learning_rate": 1.5518367346938776e-05, "loss": 0.5988926291465759, "step": 1198 }, { "epoch": 0.2398, "grad_norm": 12.946271896362305, "learning_rate": 1.5514285714285714e-05, "loss": 1.3554376363754272, "step": 1199 }, { "epoch": 0.24, "grad_norm": 13.336772918701172, "learning_rate": 1.5510204081632655e-05, "loss": 0.9927406907081604, "step": 1200 }, { "epoch": 0.2402, "grad_norm": 6.8865461349487305, "learning_rate": 1.5506122448979593e-05, "loss": 0.24796366691589355, "step": 1201 }, { "epoch": 0.2404, "grad_norm": 6.864181995391846, "learning_rate": 1.550204081632653e-05, "loss": 0.3500555753707886, "step": 1202 }, { "epoch": 0.2406, "grad_norm": 3.788555145263672, "learning_rate": 1.549795918367347e-05, "loss": 3.665847063064575, "step": 1203 }, { "epoch": 0.2408, "grad_norm": 4.365561485290527, "learning_rate": 1.549387755102041e-05, "loss": 3.6833412647247314, "step": 1204 }, { "epoch": 0.241, "grad_norm": 7.910531997680664, "learning_rate": 1.5489795918367346e-05, "loss": 0.6531661152839661, "step": 1205 }, { "epoch": 0.2412, "grad_norm": 8.578229904174805, "learning_rate": 1.5485714285714287e-05, "loss": 0.9215391278266907, "step": 1206 }, { "epoch": 0.2414, "grad_norm": 8.530535697937012, "learning_rate": 1.5481632653061225e-05, "loss": 0.9444023966789246, "step": 1207 }, { "epoch": 0.2416, "grad_norm": 8.345559120178223, "learning_rate": 1.5477551020408163e-05, "loss": 0.6658654808998108, "step": 1208 }, { "epoch": 0.2418, "grad_norm": 12.190149307250977, "learning_rate": 1.5473469387755104e-05, "loss": 1.0146502256393433, "step": 1209 }, { "epoch": 0.242, "grad_norm": 13.669610977172852, "learning_rate": 1.546938775510204e-05, "loss": 1.262741208076477, "step": 1210 }, { "epoch": 0.2422, "grad_norm": 23.169113159179688, "learning_rate": 1.546530612244898e-05, "loss": 0.4525338411331177, "step": 1211 }, { "epoch": 0.2424, "grad_norm": 12.746214866638184, "learning_rate": 1.546122448979592e-05, "loss": 0.42098966240882874, "step": 1212 }, { "epoch": 0.2426, "grad_norm": 7.180917263031006, "learning_rate": 1.545714285714286e-05, "loss": 0.3197638988494873, "step": 1213 }, { "epoch": 0.2428, "grad_norm": 5.308062553405762, "learning_rate": 1.54530612244898e-05, "loss": 0.21710480749607086, "step": 1214 }, { "epoch": 0.243, "grad_norm": 9.355246543884277, "learning_rate": 1.5448979591836736e-05, "loss": 0.39869144558906555, "step": 1215 }, { "epoch": 0.2432, "grad_norm": 5.3946990966796875, "learning_rate": 1.5444897959183674e-05, "loss": 0.0702393427491188, "step": 1216 }, { "epoch": 0.2434, "grad_norm": 9.240615844726562, "learning_rate": 1.5440816326530615e-05, "loss": 0.32536718249320984, "step": 1217 }, { "epoch": 0.2436, "grad_norm": 4.04171895980835, "learning_rate": 1.5436734693877553e-05, "loss": 0.03030652366578579, "step": 1218 }, { "epoch": 0.2438, "grad_norm": 6.824328899383545, "learning_rate": 1.543265306122449e-05, "loss": 0.12200311571359634, "step": 1219 }, { "epoch": 0.244, "grad_norm": 7.719374179840088, "learning_rate": 1.542857142857143e-05, "loss": 0.11944810301065445, "step": 1220 }, { "epoch": 0.2442, "grad_norm": 13.698663711547852, "learning_rate": 1.542448979591837e-05, "loss": 0.22648067772388458, "step": 1221 }, { "epoch": 0.2444, "grad_norm": 9.995159149169922, "learning_rate": 1.5420408163265307e-05, "loss": 0.16953688859939575, "step": 1222 }, { "epoch": 0.2446, "grad_norm": 17.760517120361328, "learning_rate": 1.5416326530612248e-05, "loss": 1.0466417074203491, "step": 1223 }, { "epoch": 0.2448, "grad_norm": 24.354721069335938, "learning_rate": 1.5412244897959185e-05, "loss": 1.1571100950241089, "step": 1224 }, { "epoch": 0.245, "grad_norm": 7.29709005355835, "learning_rate": 1.5408163265306123e-05, "loss": 0.2536810636520386, "step": 1225 }, { "epoch": 0.2452, "grad_norm": 7.742997169494629, "learning_rate": 1.5404081632653064e-05, "loss": 0.7320311069488525, "step": 1226 }, { "epoch": 0.2454, "grad_norm": 16.84010887145996, "learning_rate": 1.54e-05, "loss": 0.42508554458618164, "step": 1227 }, { "epoch": 0.2456, "grad_norm": 9.643656730651855, "learning_rate": 1.539591836734694e-05, "loss": 0.6736847758293152, "step": 1228 }, { "epoch": 0.2458, "grad_norm": 8.833418846130371, "learning_rate": 1.539183673469388e-05, "loss": 0.4503360688686371, "step": 1229 }, { "epoch": 0.246, "grad_norm": 10.094308853149414, "learning_rate": 1.5387755102040818e-05, "loss": 0.6737444400787354, "step": 1230 }, { "epoch": 0.2462, "grad_norm": 16.184350967407227, "learning_rate": 1.5383673469387756e-05, "loss": 1.3393627405166626, "step": 1231 }, { "epoch": 0.2464, "grad_norm": 10.129984855651855, "learning_rate": 1.5379591836734697e-05, "loss": 1.0179818868637085, "step": 1232 }, { "epoch": 0.2466, "grad_norm": 9.137284278869629, "learning_rate": 1.5375510204081634e-05, "loss": 0.4089559018611908, "step": 1233 }, { "epoch": 0.2468, "grad_norm": 8.856403350830078, "learning_rate": 1.5371428571428572e-05, "loss": 0.4209579527378082, "step": 1234 }, { "epoch": 0.247, "grad_norm": 9.629825592041016, "learning_rate": 1.5367346938775513e-05, "loss": 0.26891955733299255, "step": 1235 }, { "epoch": 0.2472, "grad_norm": 10.32089900970459, "learning_rate": 1.536326530612245e-05, "loss": 0.34742483496665955, "step": 1236 }, { "epoch": 0.2474, "grad_norm": 7.093871116638184, "learning_rate": 1.5359183673469388e-05, "loss": 0.5111809372901917, "step": 1237 }, { "epoch": 0.2476, "grad_norm": 7.806134223937988, "learning_rate": 1.535510204081633e-05, "loss": 0.31436750292778015, "step": 1238 }, { "epoch": 0.2478, "grad_norm": 9.90782642364502, "learning_rate": 1.5351020408163267e-05, "loss": 0.802239179611206, "step": 1239 }, { "epoch": 0.248, "grad_norm": 6.178524017333984, "learning_rate": 1.5346938775510204e-05, "loss": 0.17601804435253143, "step": 1240 }, { "epoch": 0.2482, "grad_norm": 7.302453517913818, "learning_rate": 1.5342857142857142e-05, "loss": 0.34948650002479553, "step": 1241 }, { "epoch": 0.2484, "grad_norm": 10.507240295410156, "learning_rate": 1.5338775510204083e-05, "loss": 0.5208330750465393, "step": 1242 }, { "epoch": 0.2486, "grad_norm": 9.383793830871582, "learning_rate": 1.533469387755102e-05, "loss": 0.3614520728588104, "step": 1243 }, { "epoch": 0.2488, "grad_norm": 9.760103225708008, "learning_rate": 1.533061224489796e-05, "loss": 0.4865609109401703, "step": 1244 }, { "epoch": 0.249, "grad_norm": 9.849222183227539, "learning_rate": 1.53265306122449e-05, "loss": 0.48985394835472107, "step": 1245 }, { "epoch": 0.2492, "grad_norm": 5.9976043701171875, "learning_rate": 1.5322448979591837e-05, "loss": 0.09232968837022781, "step": 1246 }, { "epoch": 0.2494, "grad_norm": 21.510356903076172, "learning_rate": 1.5318367346938775e-05, "loss": 0.9444459080696106, "step": 1247 }, { "epoch": 0.2496, "grad_norm": 17.3249568939209, "learning_rate": 1.5314285714285716e-05, "loss": 0.5190832614898682, "step": 1248 }, { "epoch": 0.2498, "grad_norm": 8.224980354309082, "learning_rate": 1.5310204081632653e-05, "loss": 0.3235931396484375, "step": 1249 }, { "epoch": 0.25, "grad_norm": 6.962210655212402, "learning_rate": 1.530612244897959e-05, "loss": 0.1323358565568924, "step": 1250 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 192, "trial_name": null, "trial_params": null }