{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 500.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 8.9375, "learning_rate": 1.8e-05, "loss": 2.4138946533203125, "loss_d0": 2.4146595120429994, "step": 10 }, { "epoch": 2.0, "grad_norm": 4.46875, "learning_rate": 3.8e-05, "loss": 0.9612014770507813, "loss_d0": 0.9622029483318328, "step": 20 }, { "epoch": 3.0, "grad_norm": 3.765625, "learning_rate": 5.8e-05, "loss": 0.6186047077178956, "loss_d0": 0.6192301928997039, "step": 30 }, { "epoch": 4.0, "grad_norm": 3.21875, "learning_rate": 7.800000000000001e-05, "loss": 0.42489213943481446, "loss_d0": 0.42572466731071473, "step": 40 }, { "epoch": 5.0, "grad_norm": 2.84375, "learning_rate": 9.8e-05, "loss": 0.33167214393615724, "loss_d0": 0.33206869661808014, "step": 50 }, { "epoch": 6.0, "grad_norm": 2.984375, "learning_rate": 0.000118, "loss": 0.3143290042877197, "loss_d0": 0.3145958036184311, "step": 60 }, { "epoch": 7.0, "grad_norm": 2.046875, "learning_rate": 0.000138, "loss": 0.24962928295135497, "loss_d0": 0.2503396525979042, "step": 70 }, { "epoch": 8.0, "grad_norm": 3.28125, "learning_rate": 0.00015800000000000002, "loss": 0.22044200897216798, "loss_d0": 0.22091176211833954, "step": 80 }, { "epoch": 9.0, "grad_norm": 2.546875, "learning_rate": 0.00017800000000000002, "loss": 0.22365751266479492, "loss_d0": 0.2240446463227272, "step": 90 }, { "epoch": 10.0, "grad_norm": 1.8359375, "learning_rate": 0.00019800000000000002, "loss": 0.22820873260498048, "loss_d0": 0.22806004285812378, "step": 100 }, { "epoch": 11.0, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.1881038784980774, "loss_d0": 0.18772014379501342, "step": 110 }, { "epoch": 12.0, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 0.16590781211853028, "loss_d0": 0.16589888483285903, "step": 120 }, { "epoch": 13.0, "grad_norm": 2.46875, "learning_rate": 0.0002, "loss": 0.15281176567077637, "loss_d0": 0.1532880373299122, "step": 130 }, { "epoch": 14.0, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.1305345416069031, "loss_d0": 0.13086820542812347, "step": 140 }, { "epoch": 15.0, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.11721111536026001, "loss_d0": 0.117678714543581, "step": 150 }, { "epoch": 16.0, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.1084968090057373, "loss_d0": 0.10861062631011009, "step": 160 }, { "epoch": 17.0, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.10888866186141968, "loss_d0": 0.109267458319664, "step": 170 }, { "epoch": 18.0, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.08663774132728577, "loss_d0": 0.08675258159637451, "step": 180 }, { "epoch": 19.0, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.09369056820869445, "loss_d0": 0.09477355107665061, "step": 190 }, { "epoch": 20.0, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.08849127292633056, "loss_d0": 0.08843691125512124, "step": 200 }, { "epoch": 21.0, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.07778345346450806, "loss_d0": 0.07785326093435288, "step": 210 }, { "epoch": 22.0, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.08467817902565003, "loss_d0": 0.08556383550167083, "step": 220 }, { "epoch": 23.0, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.08082451820373535, "loss_d0": 0.08079542741179466, "step": 230 }, { "epoch": 24.0, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.08634517192840577, "loss_d0": 0.0867164421826601, "step": 240 }, { "epoch": 25.0, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.07757908701896668, "loss_d0": 0.07813894674181938, "step": 250 }, { "epoch": 26.0, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.07069578766822815, "loss_d0": 0.07092957794666291, "step": 260 }, { "epoch": 27.0, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.06855013966560364, "loss_d0": 0.0690783817321062, "step": 270 }, { "epoch": 28.0, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.06371119022369384, "loss_d0": 0.06389858424663544, "step": 280 }, { "epoch": 29.0, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.06759281754493714, "loss_d0": 0.06811215244233608, "step": 290 }, { "epoch": 30.0, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.06775572896003723, "loss_d0": 0.06840224675834179, "step": 300 }, { "epoch": 31.0, "grad_norm": 1.8828125, "learning_rate": 0.0002, "loss": 0.07054120302200317, "loss_d0": 0.07125221230089665, "step": 310 }, { "epoch": 32.0, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0642963707447052, "loss_d0": 0.06480461172759533, "step": 320 }, { "epoch": 33.0, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.06202612519264221, "loss_d0": 0.06230700649321079, "step": 330 }, { "epoch": 34.0, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.05183584094047546, "loss_d0": 0.0517644751816988, "step": 340 }, { "epoch": 35.0, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.06536717414855957, "loss_d0": 0.06529441438615322, "step": 350 }, { "epoch": 36.0, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.05596559047698975, "loss_d0": 0.05624995082616806, "step": 360 }, { "epoch": 37.0, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 0.056243062019348145, "loss_d0": 0.0563370194286108, "step": 370 }, { "epoch": 38.0, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.053859710693359375, "loss_d0": 0.054140987992286685, "step": 380 }, { "epoch": 39.0, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.056958770751953124, "loss_d0": 0.05690641142427921, "step": 390 }, { "epoch": 40.0, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.047679942846298215, "loss_d0": 0.04806721806526184, "step": 400 }, { "epoch": 41.0, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.05440508723258972, "loss_d0": 0.054613354802131656, "step": 410 }, { "epoch": 42.0, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.053417938947677615, "loss_d0": 0.05367131717503071, "step": 420 }, { "epoch": 43.0, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.06491599082946778, "loss_d0": 0.0651374600827694, "step": 430 }, { "epoch": 44.0, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.06125739812850952, "loss_d0": 0.06130740195512772, "step": 440 }, { "epoch": 45.0, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.05628480911254883, "loss_d0": 0.05642572902143002, "step": 450 }, { "epoch": 46.0, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.05836213231086731, "loss_d0": 0.05857353545725345, "step": 460 }, { "epoch": 47.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.07030140161514283, "loss_d0": 0.07007076032459736, "step": 470 }, { "epoch": 48.0, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.06899864673614502, "loss_d0": 0.06894715838134288, "step": 480 }, { "epoch": 49.0, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.06125810742378235, "loss_d0": 0.06205868683755398, "step": 490 }, { "epoch": 50.0, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.048345702886581424, "loss_d0": 0.048552507907152175, "step": 500 }, { "epoch": 50.0, "eval_loss": 8.588546752929688, "eval_runtime": 0.14, "eval_samples_per_second": 714.47, "eval_steps_per_second": 71.447, "step": 500 }, { "epoch": 51.0, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.05221686363220215, "loss_d0": 0.05234598331153393, "step": 510 }, { "epoch": 52.0, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.046692115068435666, "loss_d0": 0.046809761226177214, "step": 520 }, { "epoch": 53.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.04488828182220459, "loss_d0": 0.04517585374414921, "step": 530 }, { "epoch": 54.0, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.050816571712493895, "loss_d0": 0.051004741340875626, "step": 540 }, { "epoch": 55.0, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 0.059779518842697145, "loss_d0": 0.059067552536726, "step": 550 }, { "epoch": 56.0, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.05488635897636414, "loss_d0": 0.055578538402915004, "step": 560 }, { "epoch": 57.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.05492435097694397, "loss_d0": 0.05542452968657017, "step": 570 }, { "epoch": 58.0, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.055647605657577516, "loss_d0": 0.055836337804794314, "step": 580 }, { "epoch": 59.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.05527116060256958, "loss_d0": 0.05528037548065186, "step": 590 }, { "epoch": 60.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.04689827561378479, "loss_d0": 0.04706285260617733, "step": 600 }, { "epoch": 61.0, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.04486198127269745, "loss_d0": 0.04498056173324585, "step": 610 }, { "epoch": 62.0, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.046405041217803956, "loss_d0": 0.0469747718423605, "step": 620 }, { "epoch": 63.0, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.04741775393486023, "loss_d0": 0.047610121220350264, "step": 630 }, { "epoch": 64.0, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.05079013109207153, "loss_d0": 0.050666602700948714, "step": 640 }, { "epoch": 65.0, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.04559406936168671, "loss_d0": 0.04578414224088192, "step": 650 }, { "epoch": 66.0, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.05777719020843506, "loss_d0": 0.058517447859048846, "step": 660 }, { "epoch": 67.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.048018404841423036, "loss_d0": 0.04816087186336517, "step": 670 }, { "epoch": 68.0, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.044987404346466066, "loss_d0": 0.045172940194606784, "step": 680 }, { "epoch": 69.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.04962021708488464, "loss_d0": 0.049866561964154246, "step": 690 }, { "epoch": 70.0, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.045312818884849546, "loss_d0": 0.04579245671629906, "step": 700 }, { "epoch": 71.0, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.04371422827243805, "loss_d0": 0.04383484534919262, "step": 710 }, { "epoch": 72.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.04401738941669464, "loss_d0": 0.04424504302442074, "step": 720 }, { "epoch": 73.0, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.049768149852752686, "loss_d0": 0.050346105545759204, "step": 730 }, { "epoch": 74.0, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0556221604347229, "loss_d0": 0.05594193115830422, "step": 740 }, { "epoch": 75.0, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.058312606811523435, "loss_d0": 0.05818049944937229, "step": 750 }, { "epoch": 76.0, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.04740939438343048, "loss_d0": 0.047768377140164374, "step": 760 }, { "epoch": 77.0, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.04591890275478363, "loss_d0": 0.04616109542548656, "step": 770 }, { "epoch": 78.0, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.05339767932891846, "loss_d0": 0.05372548848390579, "step": 780 }, { "epoch": 79.0, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.05273417234420776, "loss_d0": 0.05288811326026917, "step": 790 }, { "epoch": 80.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.04624132513999939, "loss_d0": 0.04625156968832016, "step": 800 }, { "epoch": 81.0, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.04598477482795715, "loss_d0": 0.04574774876236916, "step": 810 }, { "epoch": 82.0, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.04457350969314575, "loss_d0": 0.0447501577436924, "step": 820 }, { "epoch": 83.0, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0425421267747879, "loss_d0": 0.042947014421224596, "step": 830 }, { "epoch": 84.0, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.04168615341186523, "loss_d0": 0.041912952437996864, "step": 840 }, { "epoch": 85.0, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.04558302760124207, "loss_d0": 0.04559236913919449, "step": 850 }, { "epoch": 86.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.04107217192649841, "loss_d0": 0.041329674795269965, "step": 860 }, { "epoch": 87.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.04191828668117523, "loss_d0": 0.042078302800655366, "step": 870 }, { "epoch": 88.0, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.04548239409923553, "loss_d0": 0.046018870547413826, "step": 880 }, { "epoch": 89.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.04762960374355316, "loss_d0": 0.04762529172003269, "step": 890 }, { "epoch": 90.0, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0533312737941742, "loss_d0": 0.05363424420356751, "step": 900 }, { "epoch": 91.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.0427082359790802, "loss_d0": 0.04287462942302227, "step": 910 }, { "epoch": 92.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.041102361679077146, "loss_d0": 0.04092179089784622, "step": 920 }, { "epoch": 93.0, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.04685114622116089, "loss_d0": 0.04688086472451687, "step": 930 }, { "epoch": 94.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.04118316173553467, "loss_d0": 0.04106501340866089, "step": 940 }, { "epoch": 95.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.04072573184967041, "loss_d0": 0.04083249364048243, "step": 950 }, { "epoch": 96.0, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.04276859760284424, "loss_d0": 0.0430012721568346, "step": 960 }, { "epoch": 97.0, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.041749000549316406, "loss_d0": 0.041834582760930064, "step": 970 }, { "epoch": 98.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0400599479675293, "loss_d0": 0.0403048075735569, "step": 980 }, { "epoch": 99.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.042036592960357666, "loss_d0": 0.04251206368207931, "step": 990 }, { "epoch": 100.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.03869950473308563, "loss_d0": 0.03894384019076824, "step": 1000 }, { "epoch": 100.0, "eval_loss": 10.19174575805664, "eval_runtime": 0.1391, "eval_samples_per_second": 718.819, "eval_steps_per_second": 71.882, "step": 1000 }, { "epoch": 101.0, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.04335947036743164, "loss_d0": 0.04366342425346374, "step": 1010 }, { "epoch": 102.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.04421808123588562, "loss_d0": 0.044679348915815355, "step": 1020 }, { "epoch": 103.0, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.043343788385391234, "loss_d0": 0.043425630405545236, "step": 1030 }, { "epoch": 104.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.04365791082382202, "loss_d0": 0.04369321018457413, "step": 1040 }, { "epoch": 105.0, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.054432183504104614, "loss_d0": 0.0545597530901432, "step": 1050 }, { "epoch": 106.0, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.04702830910682678, "loss_d0": 0.04734110943973065, "step": 1060 }, { "epoch": 107.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.049464890360832216, "loss_d0": 0.05035017840564251, "step": 1070 }, { "epoch": 108.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.048483666777610776, "loss_d0": 0.04887337274849415, "step": 1080 }, { "epoch": 109.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.04406644403934479, "loss_d0": 0.04453907683491707, "step": 1090 }, { "epoch": 110.0, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.040239399671554564, "loss_d0": 0.040439304709434507, "step": 1100 }, { "epoch": 111.0, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 0.03831009566783905, "loss_d0": 0.03864146564155817, "step": 1110 }, { "epoch": 112.0, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.03766327500343323, "loss_d0": 0.037880677916109565, "step": 1120 }, { "epoch": 113.0, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.03850332498550415, "loss_d0": 0.038638781011104587, "step": 1130 }, { "epoch": 114.0, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.03850359618663788, "loss_d0": 0.038869307935237886, "step": 1140 }, { "epoch": 115.0, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 0.03971063494682312, "loss_d0": 0.04013624228537083, "step": 1150 }, { "epoch": 116.0, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.042095300555229184, "loss_d0": 0.042642821371555326, "step": 1160 }, { "epoch": 117.0, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.04150034487247467, "loss_d0": 0.04157007820904255, "step": 1170 }, { "epoch": 118.0, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.040689364075660706, "loss_d0": 0.040831054002046584, "step": 1180 }, { "epoch": 119.0, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.04281890392303467, "loss_d0": 0.04285797439515591, "step": 1190 }, { "epoch": 120.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.04245826900005341, "loss_d0": 0.04238181263208389, "step": 1200 }, { "epoch": 121.0, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 0.038734421133995056, "loss_d0": 0.03866237942129373, "step": 1210 }, { "epoch": 122.0, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0406865805387497, "loss_d0": 0.04078188501298428, "step": 1220 }, { "epoch": 123.0, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.04529264867305756, "loss_d0": 0.045334940403699876, "step": 1230 }, { "epoch": 124.0, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.04386350810527802, "loss_d0": 0.04425263442099094, "step": 1240 }, { "epoch": 125.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.0462289959192276, "loss_d0": 0.04652940817177296, "step": 1250 }, { "epoch": 126.0, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.04571775496006012, "loss_d0": 0.04559636972844601, "step": 1260 }, { "epoch": 127.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.038785919547080994, "loss_d0": 0.03859848342835903, "step": 1270 }, { "epoch": 128.0, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.03868694007396698, "loss_d0": 0.038959866762161253, "step": 1280 }, { "epoch": 129.0, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.03828883171081543, "loss_d0": 0.03848838359117508, "step": 1290 }, { "epoch": 130.0, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.03865792453289032, "loss_d0": 0.03923876471817493, "step": 1300 }, { "epoch": 131.0, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.03844528198242188, "loss_d0": 0.03887251988053322, "step": 1310 }, { "epoch": 132.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.04256443381309509, "loss_d0": 0.04252460934221745, "step": 1320 }, { "epoch": 133.0, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 0.04144605398178101, "loss_d0": 0.04168446734547615, "step": 1330 }, { "epoch": 134.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.04084254205226898, "loss_d0": 0.04107677228748798, "step": 1340 }, { "epoch": 135.0, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.04082280099391937, "loss_d0": 0.04084976222366095, "step": 1350 }, { "epoch": 136.0, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.05087977647781372, "loss_d0": 0.05094983167946339, "step": 1360 }, { "epoch": 137.0, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.03763132691383362, "loss_d0": 0.037742501497268675, "step": 1370 }, { "epoch": 138.0, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.04227721095085144, "loss_d0": 0.04258807189762592, "step": 1380 }, { "epoch": 139.0, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.04076775908470154, "loss_d0": 0.04091629646718502, "step": 1390 }, { "epoch": 140.0, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.036710423231124875, "loss_d0": 0.03685082122683525, "step": 1400 }, { "epoch": 141.0, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.03908473551273346, "loss_d0": 0.03924738299101591, "step": 1410 }, { "epoch": 142.0, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.04046695828437805, "loss_d0": 0.040642332285642624, "step": 1420 }, { "epoch": 143.0, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.04576176106929779, "loss_d0": 0.04611779041588306, "step": 1430 }, { "epoch": 144.0, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.04068447351455688, "loss_d0": 0.0409211091697216, "step": 1440 }, { "epoch": 145.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.037341105937957766, "loss_d0": 0.03750820457935333, "step": 1450 }, { "epoch": 146.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.0402258038520813, "loss_d0": 0.04033276382833719, "step": 1460 }, { "epoch": 147.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.036438849568367, "loss_d0": 0.036417641676962376, "step": 1470 }, { "epoch": 148.0, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.036809423565864564, "loss_d0": 0.03706260584294796, "step": 1480 }, { "epoch": 149.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.04308081865310669, "loss_d0": 0.04313235841691494, "step": 1490 }, { "epoch": 150.0, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.04377509355545044, "loss_d0": 0.043812135607004164, "step": 1500 }, { "epoch": 150.0, "eval_loss": 10.099786758422852, "eval_runtime": 0.1396, "eval_samples_per_second": 716.528, "eval_steps_per_second": 71.653, "step": 1500 }, { "epoch": 151.0, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.04513736963272095, "loss_d0": 0.045294683426618576, "step": 1510 }, { "epoch": 152.0, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.042070984840393066, "loss_d0": 0.04207402504980564, "step": 1520 }, { "epoch": 153.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.03895401954650879, "loss_d0": 0.039164166525006296, "step": 1530 }, { "epoch": 154.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.03788978755474091, "loss_d0": 0.03798672072589397, "step": 1540 }, { "epoch": 155.0, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.0371559351682663, "loss_d0": 0.03732948414981365, "step": 1550 }, { "epoch": 156.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.03791390657424927, "loss_d0": 0.03796781674027443, "step": 1560 }, { "epoch": 157.0, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.043032911419868466, "loss_d0": 0.04329189658164978, "step": 1570 }, { "epoch": 158.0, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.042349377274513246, "loss_d0": 0.04258726164698601, "step": 1580 }, { "epoch": 159.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.041011756658554076, "loss_d0": 0.04121678285300732, "step": 1590 }, { "epoch": 160.0, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.042557564377784726, "loss_d0": 0.042490555346012114, "step": 1600 }, { "epoch": 161.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.04044482409954071, "loss_d0": 0.04086768478155136, "step": 1610 }, { "epoch": 162.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.03680611252784729, "loss_d0": 0.037069106847047804, "step": 1620 }, { "epoch": 163.0, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.039442169666290286, "loss_d0": 0.03970975168049336, "step": 1630 }, { "epoch": 164.0, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.0388568639755249, "loss_d0": 0.03895986620336771, "step": 1640 }, { "epoch": 165.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.0399217814207077, "loss_d0": 0.04005670771002769, "step": 1650 }, { "epoch": 166.0, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.041203856468200684, "loss_d0": 0.04109587501734495, "step": 1660 }, { "epoch": 167.0, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.040146204829216006, "loss_d0": 0.04009650684893131, "step": 1670 }, { "epoch": 168.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.03617185950279236, "loss_d0": 0.03628003634512424, "step": 1680 }, { "epoch": 169.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.03408201038837433, "loss_d0": 0.03423904702067375, "step": 1690 }, { "epoch": 170.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.035675281286239625, "loss_d0": 0.035747794434428215, "step": 1700 }, { "epoch": 171.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.034537112712860106, "loss_d0": 0.03466725647449494, "step": 1710 }, { "epoch": 172.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.034722638130187986, "loss_d0": 0.034859748929739, "step": 1720 }, { "epoch": 173.0, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.03772296607494354, "loss_d0": 0.037882160022854806, "step": 1730 }, { "epoch": 174.0, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.035678941011428836, "loss_d0": 0.035915711894631386, "step": 1740 }, { "epoch": 175.0, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.039092454314231875, "loss_d0": 0.039218348637223246, "step": 1750 }, { "epoch": 176.0, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.04822979271411896, "loss_d0": 0.047857464849948884, "step": 1760 }, { "epoch": 177.0, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.046614819765090944, "loss_d0": 0.04678779914975166, "step": 1770 }, { "epoch": 178.0, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.03927460312843323, "loss_d0": 0.03958616629242897, "step": 1780 }, { "epoch": 179.0, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 0.03676096200942993, "loss_d0": 0.03699468113481998, "step": 1790 }, { "epoch": 180.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.04232467412948608, "loss_d0": 0.04274795986711979, "step": 1800 }, { "epoch": 181.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.03871000707149506, "loss_d0": 0.03887897543609142, "step": 1810 }, { "epoch": 182.0, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.03659443855285645, "loss_d0": 0.03661809824407101, "step": 1820 }, { "epoch": 183.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.035073107481002806, "loss_d0": 0.03523430619388819, "step": 1830 }, { "epoch": 184.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.035892120003700255, "loss_d0": 0.036054035648703575, "step": 1840 }, { "epoch": 185.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.03480220139026642, "loss_d0": 0.034982044622302054, "step": 1850 }, { "epoch": 186.0, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.03778698444366455, "loss_d0": 0.03809323143213987, "step": 1860 }, { "epoch": 187.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.040064454078674316, "loss_d0": 0.0400713250041008, "step": 1870 }, { "epoch": 188.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.04113616943359375, "loss_d0": 0.04127737544476986, "step": 1880 }, { "epoch": 189.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.04329647719860077, "loss_d0": 0.04343764334917068, "step": 1890 }, { "epoch": 190.0, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.045363953709602355, "loss_d0": 0.045822636038064954, "step": 1900 }, { "epoch": 191.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.04857678413391113, "loss_d0": 0.04872763194143772, "step": 1910 }, { "epoch": 192.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.039028006792068484, "loss_d0": 0.039162874594330786, "step": 1920 }, { "epoch": 193.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.037773504853248596, "loss_d0": 0.03795196227729321, "step": 1930 }, { "epoch": 194.0, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.038288545608520505, "loss_d0": 0.03851104043424129, "step": 1940 }, { "epoch": 195.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.03629574179649353, "loss_d0": 0.036422957107424735, "step": 1950 }, { "epoch": 196.0, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 0.03418645262718201, "loss_d0": 0.034383627213537694, "step": 1960 }, { "epoch": 197.0, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.03561476767063141, "loss_d0": 0.03591015879064798, "step": 1970 }, { "epoch": 198.0, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.034640243649482726, "loss_d0": 0.03486303184181452, "step": 1980 }, { "epoch": 199.0, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.0359109491109848, "loss_d0": 0.036339792795479296, "step": 1990 }, { "epoch": 200.0, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.03623954951763153, "loss_d0": 0.036560784094035625, "step": 2000 }, { "epoch": 200.0, "eval_loss": 10.187005996704102, "eval_runtime": 0.139, "eval_samples_per_second": 719.304, "eval_steps_per_second": 71.93, "step": 2000 }, { "epoch": 201.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.03511194586753845, "loss_d0": 0.035325034707784655, "step": 2010 }, { "epoch": 202.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03389591574668884, "loss_d0": 0.03413012661039829, "step": 2020 }, { "epoch": 203.0, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.03555125892162323, "loss_d0": 0.035829301737248895, "step": 2030 }, { "epoch": 204.0, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.03692673146724701, "loss_d0": 0.03710946962237358, "step": 2040 }, { "epoch": 205.0, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.037002432346344, "loss_d0": 0.0371723372489214, "step": 2050 }, { "epoch": 206.0, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.035772857069969174, "loss_d0": 0.03592176493257284, "step": 2060 }, { "epoch": 207.0, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.04214143455028534, "loss_d0": 0.04232599698007107, "step": 2070 }, { "epoch": 208.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.046618017554283145, "loss_d0": 0.04666309393942356, "step": 2080 }, { "epoch": 209.0, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.04070812165737152, "loss_d0": 0.040983137860894206, "step": 2090 }, { "epoch": 210.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.03687845766544342, "loss_d0": 0.036946045234799385, "step": 2100 }, { "epoch": 211.0, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.03687340021133423, "loss_d0": 0.037080207653343675, "step": 2110 }, { "epoch": 212.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.03650786876678467, "loss_d0": 0.036709110252559185, "step": 2120 }, { "epoch": 213.0, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.03592644035816193, "loss_d0": 0.03618489000946283, "step": 2130 }, { "epoch": 214.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03352642357349396, "loss_d0": 0.033740665577352044, "step": 2140 }, { "epoch": 215.0, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 0.033082038164138794, "loss_d0": 0.03320937901735306, "step": 2150 }, { "epoch": 216.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.03333630859851837, "loss_d0": 0.03355294372886419, "step": 2160 }, { "epoch": 217.0, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.03343007862567902, "loss_d0": 0.0335527554154396, "step": 2170 }, { "epoch": 218.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.0336355596780777, "loss_d0": 0.0337705560028553, "step": 2180 }, { "epoch": 219.0, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.034079357981681824, "loss_d0": 0.03418950513005257, "step": 2190 }, { "epoch": 220.0, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.03472439646720886, "loss_d0": 0.03485944531857967, "step": 2200 }, { "epoch": 221.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.03487759828567505, "loss_d0": 0.035057125985622405, "step": 2210 }, { "epoch": 222.0, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.034225499629974364, "loss_d0": 0.03437905199825764, "step": 2220 }, { "epoch": 223.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.03416237533092499, "loss_d0": 0.03426816165447235, "step": 2230 }, { "epoch": 224.0, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.03371023833751678, "loss_d0": 0.0338993389159441, "step": 2240 }, { "epoch": 225.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.03552867174148559, "loss_d0": 0.03563482351601124, "step": 2250 }, { "epoch": 226.0, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.03628787696361542, "loss_d0": 0.03656598404049873, "step": 2260 }, { "epoch": 227.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.04079635143280029, "loss_d0": 0.04079515114426613, "step": 2270 }, { "epoch": 228.0, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.03795821666717529, "loss_d0": 0.03802300468087196, "step": 2280 }, { "epoch": 229.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.038852787017822264, "loss_d0": 0.039321544021368025, "step": 2290 }, { "epoch": 230.0, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.041524294018745425, "loss_d0": 0.04147848449647427, "step": 2300 }, { "epoch": 231.0, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.03884131908416748, "loss_d0": 0.03886221721768379, "step": 2310 }, { "epoch": 232.0, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.04030242264270782, "loss_d0": 0.04045657627284527, "step": 2320 }, { "epoch": 233.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.04222070574760437, "loss_d0": 0.042415831610560416, "step": 2330 }, { "epoch": 234.0, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.04108119308948517, "loss_d0": 0.041366887465119365, "step": 2340 }, { "epoch": 235.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.03535057008266449, "loss_d0": 0.03571846466511488, "step": 2350 }, { "epoch": 236.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.035059237480163576, "loss_d0": 0.035127292573452, "step": 2360 }, { "epoch": 237.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.03736799955368042, "loss_d0": 0.03745412621647119, "step": 2370 }, { "epoch": 238.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.03551305830478668, "loss_d0": 0.0356417216360569, "step": 2380 }, { "epoch": 239.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.03457649946212769, "loss_d0": 0.03480789009481668, "step": 2390 }, { "epoch": 240.0, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.03957979381084442, "loss_d0": 0.04006949737668038, "step": 2400 }, { "epoch": 241.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.0365037590265274, "loss_d0": 0.03668595440685749, "step": 2410 }, { "epoch": 242.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.03540135025978088, "loss_d0": 0.035515340603888036, "step": 2420 }, { "epoch": 243.0, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.03719135224819183, "loss_d0": 0.0371862705796957, "step": 2430 }, { "epoch": 244.0, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.03541145920753479, "loss_d0": 0.03548047095537186, "step": 2440 }, { "epoch": 245.0, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.03484504222869873, "loss_d0": 0.03501688167452812, "step": 2450 }, { "epoch": 246.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.038421687483787534, "loss_d0": 0.0385257288813591, "step": 2460 }, { "epoch": 247.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.04063983857631683, "loss_d0": 0.0406292287632823, "step": 2470 }, { "epoch": 248.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.03950218856334686, "loss_d0": 0.03991553075611591, "step": 2480 }, { "epoch": 249.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.04504139721393585, "loss_d0": 0.044934678450226785, "step": 2490 }, { "epoch": 250.0, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.044790136814117434, "loss_d0": 0.04500455781817436, "step": 2500 }, { "epoch": 250.0, "eval_loss": 8.968677520751953, "eval_runtime": 0.1386, "eval_samples_per_second": 721.638, "eval_steps_per_second": 72.164, "step": 2500 }, { "epoch": 251.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.03921242952346802, "loss_d0": 0.03957866467535496, "step": 2510 }, { "epoch": 252.0, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.03758401870727539, "loss_d0": 0.03765736278146505, "step": 2520 }, { "epoch": 253.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.04317566752433777, "loss_d0": 0.04380397293716669, "step": 2530 }, { "epoch": 254.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.037376290559768675, "loss_d0": 0.037432397902011874, "step": 2540 }, { "epoch": 255.0, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.039767023921012876, "loss_d0": 0.04032333269715309, "step": 2550 }, { "epoch": 256.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.03463535010814667, "loss_d0": 0.03480861857533455, "step": 2560 }, { "epoch": 257.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.03285456895828247, "loss_d0": 0.03306712061166763, "step": 2570 }, { "epoch": 258.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.03332630395889282, "loss_d0": 0.03344619534909725, "step": 2580 }, { "epoch": 259.0, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.03543700873851776, "loss_d0": 0.03559563998132944, "step": 2590 }, { "epoch": 260.0, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.036532235145568845, "loss_d0": 0.03675668500363827, "step": 2600 }, { "epoch": 261.0, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.03505159020423889, "loss_d0": 0.03499522525817156, "step": 2610 }, { "epoch": 262.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.037879806756973264, "loss_d0": 0.03809732664376497, "step": 2620 }, { "epoch": 263.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.03381354212760925, "loss_d0": 0.03394233249127865, "step": 2630 }, { "epoch": 264.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.03334706723690033, "loss_d0": 0.0334820106625557, "step": 2640 }, { "epoch": 265.0, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.035118257999420165, "loss_d0": 0.03531328849494457, "step": 2650 }, { "epoch": 266.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03638906180858612, "loss_d0": 0.03639591373503208, "step": 2660 }, { "epoch": 267.0, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.038152918219566345, "loss_d0": 0.0381375428289175, "step": 2670 }, { "epoch": 268.0, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.03639993965625763, "loss_d0": 0.03658099547028541, "step": 2680 }, { "epoch": 269.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.033522921800613406, "loss_d0": 0.033687918819487093, "step": 2690 }, { "epoch": 270.0, "grad_norm": 0.2080078125, "learning_rate": 0.0002, "loss": 0.03548611998558045, "loss_d0": 0.035678256303071976, "step": 2700 }, { "epoch": 271.0, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 0.03788673281669617, "loss_d0": 0.038012200966477394, "step": 2710 }, { "epoch": 272.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.039587223529815675, "loss_d0": 0.03958538956940174, "step": 2720 }, { "epoch": 273.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.035868316888809204, "loss_d0": 0.03592061344534159, "step": 2730 }, { "epoch": 274.0, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.03773685693740845, "loss_d0": 0.037729034572839736, "step": 2740 }, { "epoch": 275.0, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 0.03608798086643219, "loss_d0": 0.036122091487050055, "step": 2750 }, { "epoch": 276.0, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 0.03635281622409821, "loss_d0": 0.03639194741845131, "step": 2760 }, { "epoch": 277.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.03512744605541229, "loss_d0": 0.03519647419452667, "step": 2770 }, { "epoch": 278.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.033726102113723753, "loss_d0": 0.03386385552585125, "step": 2780 }, { "epoch": 279.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.03256627917289734, "loss_d0": 0.03272116258740425, "step": 2790 }, { "epoch": 280.0, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.032463929057121275, "loss_d0": 0.03261604756116867, "step": 2800 }, { "epoch": 281.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.033078649640083314, "loss_d0": 0.03321625180542469, "step": 2810 }, { "epoch": 282.0, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 0.033195015788078305, "loss_d0": 0.03333393353968859, "step": 2820 }, { "epoch": 283.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.033011114597320555, "loss_d0": 0.03315127640962601, "step": 2830 }, { "epoch": 284.0, "grad_norm": 0.2255859375, "learning_rate": 0.0002, "loss": 0.03291727304458618, "loss_d0": 0.03308185450732708, "step": 2840 }, { "epoch": 285.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.03410106897354126, "loss_d0": 0.034225675836205484, "step": 2850 }, { "epoch": 286.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.03687707483768463, "loss_d0": 0.036962743103504184, "step": 2860 }, { "epoch": 287.0, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 0.03679845929145813, "loss_d0": 0.03684771880507469, "step": 2870 }, { "epoch": 288.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.037522953748703, "loss_d0": 0.037532795406878, "step": 2880 }, { "epoch": 289.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.036753326654434204, "loss_d0": 0.03696324098855257, "step": 2890 }, { "epoch": 290.0, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.03908161222934723, "loss_d0": 0.039364179223775865, "step": 2900 }, { "epoch": 291.0, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.03866567611694336, "loss_d0": 0.03884962908923626, "step": 2910 }, { "epoch": 292.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.03543041348457336, "loss_d0": 0.03545324634760618, "step": 2920 }, { "epoch": 293.0, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.03430930972099304, "loss_d0": 0.03445378467440605, "step": 2930 }, { "epoch": 294.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.03432491421699524, "loss_d0": 0.03445266745984554, "step": 2940 }, { "epoch": 295.0, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 0.03572409451007843, "loss_d0": 0.035761214606463906, "step": 2950 }, { "epoch": 296.0, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 0.036069047451019284, "loss_d0": 0.03615146838128567, "step": 2960 }, { "epoch": 297.0, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 0.03799552321434021, "loss_d0": 0.03810358978807926, "step": 2970 }, { "epoch": 298.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.03695048093795776, "loss_d0": 0.03708292245864868, "step": 2980 }, { "epoch": 299.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.03596867620944977, "loss_d0": 0.03633528091013431, "step": 2990 }, { "epoch": 300.0, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 0.034122595191001893, "loss_d0": 0.03427013717591763, "step": 3000 }, { "epoch": 300.0, "eval_loss": 10.386900901794434, "eval_runtime": 0.139, "eval_samples_per_second": 719.273, "eval_steps_per_second": 71.927, "step": 3000 }, { "epoch": 301.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.03683215379714966, "loss_d0": 0.03691470641642809, "step": 3010 }, { "epoch": 302.0, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.03393584489822388, "loss_d0": 0.03424133453518152, "step": 3020 }, { "epoch": 303.0, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 0.03355291485786438, "loss_d0": 0.033677864074707034, "step": 3030 }, { "epoch": 304.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03414491713047028, "loss_d0": 0.034268779680132866, "step": 3040 }, { "epoch": 305.0, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.033174335956573486, "loss_d0": 0.03330630790442228, "step": 3050 }, { "epoch": 306.0, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 0.03314556479454041, "loss_d0": 0.03322423882782459, "step": 3060 }, { "epoch": 307.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.03415323793888092, "loss_d0": 0.03426978774368763, "step": 3070 }, { "epoch": 308.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.03368777930736542, "loss_d0": 0.033808144927024844, "step": 3080 }, { "epoch": 309.0, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.033814409375190736, "loss_d0": 0.03400766029953957, "step": 3090 }, { "epoch": 310.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.03682324290275574, "loss_d0": 0.03683286644518376, "step": 3100 }, { "epoch": 311.0, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.038091260194778445, "loss_d0": 0.03829929493367672, "step": 3110 }, { "epoch": 312.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.03884383738040924, "loss_d0": 0.039184533432126044, "step": 3120 }, { "epoch": 313.0, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.04147293865680694, "loss_d0": 0.041862902045249936, "step": 3130 }, { "epoch": 314.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.043798410892486574, "loss_d0": 0.04384028725326061, "step": 3140 }, { "epoch": 315.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.039188632369041444, "loss_d0": 0.03918187767267227, "step": 3150 }, { "epoch": 316.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.035008540749549864, "loss_d0": 0.03513674270361662, "step": 3160 }, { "epoch": 317.0, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.03717628419399262, "loss_d0": 0.03712181244045496, "step": 3170 }, { "epoch": 318.0, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 0.038753950595855714, "loss_d0": 0.03883886393159628, "step": 3180 }, { "epoch": 319.0, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.032760214805603025, "loss_d0": 0.03286585081368685, "step": 3190 }, { "epoch": 320.0, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 0.03260535299777985, "loss_d0": 0.032791460305452345, "step": 3200 }, { "epoch": 321.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.03248119056224823, "loss_d0": 0.03258972428739071, "step": 3210 }, { "epoch": 322.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.03233232796192169, "loss_d0": 0.03252448942512274, "step": 3220 }, { "epoch": 323.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.034965500235557556, "loss_d0": 0.035115770623087884, "step": 3230 }, { "epoch": 324.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.033337122201919554, "loss_d0": 0.03353979028761387, "step": 3240 }, { "epoch": 325.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.03400499820709228, "loss_d0": 0.03429629430174828, "step": 3250 }, { "epoch": 326.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.0365963339805603, "loss_d0": 0.03673166874796152, "step": 3260 }, { "epoch": 327.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.03627434968948364, "loss_d0": 0.03651664853096008, "step": 3270 }, { "epoch": 328.0, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.03481152355670929, "loss_d0": 0.03493843246251345, "step": 3280 }, { "epoch": 329.0, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 0.03715276122093201, "loss_d0": 0.03730902448296547, "step": 3290 }, { "epoch": 330.0, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.03660928010940552, "loss_d0": 0.03661738894879818, "step": 3300 }, { "epoch": 331.0, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.036237123608589175, "loss_d0": 0.03629975281655788, "step": 3310 }, { "epoch": 332.0, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.03516555726528168, "loss_d0": 0.03544416427612305, "step": 3320 }, { "epoch": 333.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.03515617847442627, "loss_d0": 0.03528529480099678, "step": 3330 }, { "epoch": 334.0, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.03232303559780121, "loss_d0": 0.03249768242239952, "step": 3340 }, { "epoch": 335.0, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.03263014554977417, "loss_d0": 0.032782161794602874, "step": 3350 }, { "epoch": 336.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.03202419579029083, "loss_d0": 0.032148735225200654, "step": 3360 }, { "epoch": 337.0, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.03205806612968445, "loss_d0": 0.03226653411984444, "step": 3370 }, { "epoch": 338.0, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 0.032559001445770265, "loss_d0": 0.03269967641681433, "step": 3380 }, { "epoch": 339.0, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.033017808198928834, "loss_d0": 0.03317041750997305, "step": 3390 }, { "epoch": 340.0, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.03368024230003357, "loss_d0": 0.0338536249473691, "step": 3400 }, { "epoch": 341.0, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.033771827816963196, "loss_d0": 0.033943959325551984, "step": 3410 }, { "epoch": 342.0, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.032893133163452146, "loss_d0": 0.03305067028850317, "step": 3420 }, { "epoch": 343.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.033337703347206114, "loss_d0": 0.033495990186929704, "step": 3430 }, { "epoch": 344.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.03301987051963806, "loss_d0": 0.0332637595012784, "step": 3440 }, { "epoch": 345.0, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 0.0331781804561615, "loss_d0": 0.03328663818538189, "step": 3450 }, { "epoch": 346.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.033059829473495485, "loss_d0": 0.03323287479579449, "step": 3460 }, { "epoch": 347.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.0326650857925415, "loss_d0": 0.032817641645669936, "step": 3470 }, { "epoch": 348.0, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.03305915892124176, "loss_d0": 0.033248364180326465, "step": 3480 }, { "epoch": 349.0, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.03351768255233765, "loss_d0": 0.03367482423782349, "step": 3490 }, { "epoch": 350.0, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.03559110462665558, "loss_d0": 0.03580189272761345, "step": 3500 }, { "epoch": 350.0, "eval_loss": 10.393937110900879, "eval_runtime": 0.1384, "eval_samples_per_second": 722.367, "eval_steps_per_second": 72.237, "step": 3500 }, { "epoch": 351.0, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 0.03623929023742676, "loss_d0": 0.03615676146000624, "step": 3510 }, { "epoch": 352.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.041447535157203674, "loss_d0": 0.04138267450034618, "step": 3520 }, { "epoch": 353.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.03586837351322174, "loss_d0": 0.03592200707644224, "step": 3530 }, { "epoch": 354.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.03626823723316193, "loss_d0": 0.036303963884711264, "step": 3540 }, { "epoch": 355.0, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.037262076139450075, "loss_d0": 0.03748076409101486, "step": 3550 }, { "epoch": 356.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.03400363326072693, "loss_d0": 0.03407878540456295, "step": 3560 }, { "epoch": 357.0, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 0.03302598893642426, "loss_d0": 0.033188611082732675, "step": 3570 }, { "epoch": 358.0, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.033322757482528685, "loss_d0": 0.033429968543350695, "step": 3580 }, { "epoch": 359.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.032757741212844846, "loss_d0": 0.032907084189355375, "step": 3590 }, { "epoch": 360.0, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 0.03233836889266968, "loss_d0": 0.03248996958136559, "step": 3600 }, { "epoch": 361.0, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 0.03228305578231812, "loss_d0": 0.032450858317315576, "step": 3610 }, { "epoch": 362.0, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.031928643584251404, "loss_d0": 0.0320748321712017, "step": 3620 }, { "epoch": 363.0, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.03240954875946045, "loss_d0": 0.03259107153862715, "step": 3630 }, { "epoch": 364.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.033070188760757444, "loss_d0": 0.033200465887784955, "step": 3640 }, { "epoch": 365.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.03347308337688446, "loss_d0": 0.03367231860756874, "step": 3650 }, { "epoch": 366.0, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.03574792146682739, "loss_d0": 0.035922230780124666, "step": 3660 }, { "epoch": 367.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.036630797386169436, "loss_d0": 0.036798950470983985, "step": 3670 }, { "epoch": 368.0, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.03712879717350006, "loss_d0": 0.0374255184084177, "step": 3680 }, { "epoch": 369.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.034944096207618715, "loss_d0": 0.035139490850269794, "step": 3690 }, { "epoch": 370.0, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.04007576704025269, "loss_d0": 0.04031298961490393, "step": 3700 }, { "epoch": 371.0, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.03740532994270325, "loss_d0": 0.037719056382775305, "step": 3710 }, { "epoch": 372.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.035480844974517825, "loss_d0": 0.035916320607066156, "step": 3720 }, { "epoch": 373.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.03470188677310944, "loss_d0": 0.03503651265054941, "step": 3730 }, { "epoch": 374.0, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.03549116551876068, "loss_d0": 0.035736043751239774, "step": 3740 }, { "epoch": 375.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.03598669171333313, "loss_d0": 0.03614006638526916, "step": 3750 }, { "epoch": 376.0, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.035441306233406064, "loss_d0": 0.0356020450592041, "step": 3760 }, { "epoch": 377.0, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 0.03322634994983673, "loss_d0": 0.03331465721130371, "step": 3770 }, { "epoch": 378.0, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.03281596004962921, "loss_d0": 0.03296285588294268, "step": 3780 }, { "epoch": 379.0, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.032360291481018065, "loss_d0": 0.03251561634242535, "step": 3790 }, { "epoch": 380.0, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 0.03268592357635498, "loss_d0": 0.03296835850924253, "step": 3800 }, { "epoch": 381.0, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.032299524545669554, "loss_d0": 0.032451895996928215, "step": 3810 }, { "epoch": 382.0, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 0.03271079659461975, "loss_d0": 0.03295987173914909, "step": 3820 }, { "epoch": 383.0, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 0.03348786532878876, "loss_d0": 0.03376887794584036, "step": 3830 }, { "epoch": 384.0, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 0.03320521414279938, "loss_d0": 0.03338473234325647, "step": 3840 }, { "epoch": 385.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.03748701810836792, "loss_d0": 0.037659105844795704, "step": 3850 }, { "epoch": 386.0, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.037096035480499265, "loss_d0": 0.037257416918873784, "step": 3860 }, { "epoch": 387.0, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 0.03656034171581268, "loss_d0": 0.03679768163710832, "step": 3870 }, { "epoch": 388.0, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 0.03579927086830139, "loss_d0": 0.0359389653429389, "step": 3880 }, { "epoch": 389.0, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 0.03452290296554565, "loss_d0": 0.03470381684601307, "step": 3890 }, { "epoch": 390.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.03379636406898499, "loss_d0": 0.033996118046343324, "step": 3900 }, { "epoch": 391.0, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 0.03610147833824158, "loss_d0": 0.036345665156841275, "step": 3910 }, { "epoch": 392.0, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 0.0401570588350296, "loss_d0": 0.04051978532224894, "step": 3920 }, { "epoch": 393.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.03571869134902954, "loss_d0": 0.035916782543063165, "step": 3930 }, { "epoch": 394.0, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.03941313028335571, "loss_d0": 0.03940380644053221, "step": 3940 }, { "epoch": 395.0, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 0.03546488285064697, "loss_d0": 0.035860291495919225, "step": 3950 }, { "epoch": 396.0, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.034747114777565, "loss_d0": 0.034962891787290576, "step": 3960 }, { "epoch": 397.0, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.03279576301574707, "loss_d0": 0.03299913611263037, "step": 3970 }, { "epoch": 398.0, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.03331714868545532, "loss_d0": 0.03338895961642265, "step": 3980 }, { "epoch": 399.0, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 0.032717388868331906, "loss_d0": 0.032927786000072956, "step": 3990 }, { "epoch": 400.0, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.032257989048957825, "loss_d0": 0.032345013320446016, "step": 4000 }, { "epoch": 400.0, "eval_loss": 10.058531761169434, "eval_runtime": 0.1385, "eval_samples_per_second": 722.228, "eval_steps_per_second": 72.223, "step": 4000 }, { "epoch": 401.0, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.031912589073181154, "loss_d0": 0.032074902392923835, "step": 4010 }, { "epoch": 402.0, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.03260010480880737, "loss_d0": 0.03272282462567091, "step": 4020 }, { "epoch": 403.0, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.033337125182151796, "loss_d0": 0.033443492092192176, "step": 4030 }, { "epoch": 404.0, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.032800838351249695, "loss_d0": 0.032973457127809525, "step": 4040 }, { "epoch": 405.0, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.03818729817867279, "loss_d0": 0.038232280500233175, "step": 4050 }, { "epoch": 406.0, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.034884825348854065, "loss_d0": 0.0350344393402338, "step": 4060 }, { "epoch": 407.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.033850425481796266, "loss_d0": 0.03393289912492037, "step": 4070 }, { "epoch": 408.0, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.036410349607467654, "loss_d0": 0.03655528649687767, "step": 4080 }, { "epoch": 409.0, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.03442665934562683, "loss_d0": 0.03471992388367653, "step": 4090 }, { "epoch": 410.0, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.03422979712486267, "loss_d0": 0.03457647804170847, "step": 4100 }, { "epoch": 411.0, "grad_norm": 0.296875, "learning_rate": 0.00019995559043291586, "loss": 0.0334944874048233, "loss_d0": 0.0336628831923008, "step": 4110 }, { "epoch": 412.0, "grad_norm": 0.275390625, "learning_rate": 0.0001998021321462845, "loss": 0.03505198955535889, "loss_d0": 0.0351263914257288, "step": 4120 }, { "epoch": 413.0, "grad_norm": 0.80859375, "learning_rate": 0.00019953926379459095, "loss": 0.03730386793613434, "loss_d0": 0.03729016147553921, "step": 4130 }, { "epoch": 414.0, "grad_norm": 0.333984375, "learning_rate": 0.00019916730564242994, "loss": 0.0371760368347168, "loss_d0": 0.037596262991428375, "step": 4140 }, { "epoch": 415.0, "grad_norm": 0.46875, "learning_rate": 0.00019868671086351413, "loss": 0.03883700668811798, "loss_d0": 0.03939884938299656, "step": 4150 }, { "epoch": 416.0, "grad_norm": 0.328125, "learning_rate": 0.00019809806498855166, "loss": 0.03612925708293915, "loss_d0": 0.03640542384237051, "step": 4160 }, { "epoch": 417.0, "grad_norm": 0.3046875, "learning_rate": 0.00019740208519186726, "loss": 0.03312317132949829, "loss_d0": 0.03327005449682474, "step": 4170 }, { "epoch": 418.0, "grad_norm": 0.369140625, "learning_rate": 0.0001965996194176357, "loss": 0.03240028619766235, "loss_d0": 0.03252810388803482, "step": 4180 }, { "epoch": 419.0, "grad_norm": 0.283203125, "learning_rate": 0.00019569164534679248, "loss": 0.032256463170051576, "loss_d0": 0.03243272062391043, "step": 4190 }, { "epoch": 420.0, "grad_norm": 0.478515625, "learning_rate": 0.0001946792692058803, "loss": 0.03251784741878509, "loss_d0": 0.03268965985625982, "step": 4200 }, { "epoch": 421.0, "grad_norm": 0.431640625, "learning_rate": 0.00019356372441928221, "loss": 0.03413170278072357, "loss_d0": 0.03415941409766674, "step": 4210 }, { "epoch": 422.0, "grad_norm": 0.330078125, "learning_rate": 0.00019234637010648426, "loss": 0.03352169692516327, "loss_d0": 0.03356893640011549, "step": 4220 }, { "epoch": 423.0, "grad_norm": 0.39453125, "learning_rate": 0.00019102868942619743, "loss": 0.03519007265567779, "loss_d0": 0.03537472262978554, "step": 4230 }, { "epoch": 424.0, "grad_norm": 0.30859375, "learning_rate": 0.00018961228776935755, "loss": 0.033835163712501524, "loss_d0": 0.0340066185221076, "step": 4240 }, { "epoch": 425.0, "grad_norm": 0.330078125, "learning_rate": 0.00018809889080320357, "loss": 0.03348900973796844, "loss_d0": 0.03368382565677166, "step": 4250 }, { "epoch": 426.0, "grad_norm": 0.359375, "learning_rate": 0.00018649034236881777, "loss": 0.032539117336273196, "loss_d0": 0.03265139237046242, "step": 4260 }, { "epoch": 427.0, "grad_norm": 0.265625, "learning_rate": 0.00018478860223468955, "loss": 0.03206770420074463, "loss_d0": 0.03221498392522335, "step": 4270 }, { "epoch": 428.0, "grad_norm": 0.353515625, "learning_rate": 0.0001829957437090394, "loss": 0.031978198885917665, "loss_d0": 0.032128226943314075, "step": 4280 }, { "epoch": 429.0, "grad_norm": 0.298828125, "learning_rate": 0.00018111395111381214, "loss": 0.03165724277496338, "loss_d0": 0.031812858395278455, "step": 4290 }, { "epoch": 430.0, "grad_norm": 0.28515625, "learning_rate": 0.00017914551712341713, "loss": 0.03185569643974304, "loss_d0": 0.032014632038772105, "step": 4300 }, { "epoch": 431.0, "grad_norm": 0.29296875, "learning_rate": 0.0001770928399714576, "loss": 0.03165157437324524, "loss_d0": 0.03178926482796669, "step": 4310 }, { "epoch": 432.0, "grad_norm": 0.3984375, "learning_rate": 0.0001749584205288526, "loss": 0.032035303115844724, "loss_d0": 0.03216155916452408, "step": 4320 }, { "epoch": 433.0, "grad_norm": 0.322265625, "learning_rate": 0.00017274485925691083, "loss": 0.03216339349746704, "loss_d0": 0.03232760801911354, "step": 4330 }, { "epoch": 434.0, "grad_norm": 0.271484375, "learning_rate": 0.00017045485303906913, "loss": 0.0319296658039093, "loss_d0": 0.03210555557161569, "step": 4340 }, { "epoch": 435.0, "grad_norm": 0.341796875, "learning_rate": 0.00016809119189515557, "loss": 0.03176964819431305, "loss_d0": 0.03191218227148056, "step": 4350 }, { "epoch": 436.0, "grad_norm": 0.3203125, "learning_rate": 0.00016565675558217989, "loss": 0.031781908869743344, "loss_d0": 0.03193502109497785, "step": 4360 }, { "epoch": 437.0, "grad_norm": 0.376953125, "learning_rate": 0.00016315451008579328, "loss": 0.03168573677539825, "loss_d0": 0.031842239387333396, "step": 4370 }, { "epoch": 438.0, "grad_norm": 0.2734375, "learning_rate": 0.00016058750400669178, "loss": 0.03162899613380432, "loss_d0": 0.03178664371371269, "step": 4380 }, { "epoch": 439.0, "grad_norm": 0.3125, "learning_rate": 0.0001579588648463657, "loss": 0.03158285319805145, "loss_d0": 0.03172982707619667, "step": 4390 }, { "epoch": 440.0, "grad_norm": 0.26953125, "learning_rate": 0.00015527179519672117, "loss": 0.03189119398593902, "loss_d0": 0.03203168921172619, "step": 4400 }, { "epoch": 441.0, "grad_norm": 0.341796875, "learning_rate": 0.00015252956883821488, "loss": 0.03179541230201721, "loss_d0": 0.031979261338710784, "step": 4410 }, { "epoch": 442.0, "grad_norm": 0.296875, "learning_rate": 0.00014973552675125708, "loss": 0.03197322189807892, "loss_d0": 0.03215518109500408, "step": 4420 }, { "epoch": 443.0, "grad_norm": 0.3125, "learning_rate": 0.00014689307304574154, "loss": 0.0316112220287323, "loss_d0": 0.03175947219133377, "step": 4430 }, { "epoch": 444.0, "grad_norm": 0.2578125, "learning_rate": 0.00014400567081366205, "loss": 0.031584432721138, "loss_d0": 0.031713084876537324, "step": 4440 }, { "epoch": 445.0, "grad_norm": 0.31640625, "learning_rate": 0.00014107683790986813, "loss": 0.031557098031044006, "loss_d0": 0.031706058979034425, "step": 4450 }, { "epoch": 446.0, "grad_norm": 0.484375, "learning_rate": 0.00013811014266610096, "loss": 0.0314616322517395, "loss_d0": 0.03159730080515146, "step": 4460 }, { "epoch": 447.0, "grad_norm": 0.296875, "learning_rate": 0.00013510919954353066, "loss": 0.03137812614440918, "loss_d0": 0.031485173292458056, "step": 4470 }, { "epoch": 448.0, "grad_norm": 0.333984375, "learning_rate": 0.00013207766472909225, "loss": 0.031725209951400754, "loss_d0": 0.03188701011240482, "step": 4480 }, { "epoch": 449.0, "grad_norm": 0.3359375, "learning_rate": 0.000129019231680985, "loss": 0.03140555918216705, "loss_d0": 0.031553713604807854, "step": 4490 }, { "epoch": 450.0, "grad_norm": 0.287109375, "learning_rate": 0.0001259376266287625, "loss": 0.031349974870681765, "loss_d0": 0.0315061841160059, "step": 4500 }, { "epoch": 450.0, "eval_loss": 11.965375900268555, "eval_runtime": 0.1383, "eval_samples_per_second": 723.18, "eval_steps_per_second": 72.318, "step": 4500 }, { "epoch": 451.0, "grad_norm": 0.310546875, "learning_rate": 0.00012283660403349607, "loss": 0.03128575384616852, "loss_d0": 0.03142898045480251, "step": 4510 }, { "epoch": 452.0, "grad_norm": 0.255859375, "learning_rate": 0.00011971994201354204, "loss": 0.031337812542915344, "loss_d0": 0.031481748633086684, "step": 4520 }, { "epoch": 453.0, "grad_norm": 0.263671875, "learning_rate": 0.00011659143774148684, "loss": 0.031167712807655335, "loss_d0": 0.03133993223309517, "step": 4530 }, { "epoch": 454.0, "grad_norm": 0.1923828125, "learning_rate": 0.0001134549028178768, "loss": 0.0312427818775177, "loss_d0": 0.03139432743191719, "step": 4540 }, { "epoch": 455.0, "grad_norm": 0.3046875, "learning_rate": 0.00011031415862737014, "loss": 0.031090378761291504, "loss_d0": 0.031241376884281635, "step": 4550 }, { "epoch": 456.0, "grad_norm": 0.36328125, "learning_rate": 0.00010717303168296846, "loss": 0.03118278682231903, "loss_d0": 0.03133631870150566, "step": 4560 }, { "epoch": 457.0, "grad_norm": 0.359375, "learning_rate": 0.000104035348964, "loss": 0.031090670824050905, "loss_d0": 0.031235255300998688, "step": 4570 }, { "epoch": 458.0, "grad_norm": 0.298828125, "learning_rate": 0.00010090493325353484, "loss": 0.031037402153015137, "loss_d0": 0.031187831424176693, "step": 4580 }, { "epoch": 459.0, "grad_norm": 0.3125, "learning_rate": 9.778559848091261e-05, "loss": 0.031089764833450318, "loss_d0": 0.03125000651925802, "step": 4590 }, { "epoch": 460.0, "grad_norm": 0.30078125, "learning_rate": 9.468114507505707e-05, "loss": 0.03099344074726105, "loss_d0": 0.031137890182435513, "step": 4600 }, { "epoch": 461.0, "grad_norm": 0.2734375, "learning_rate": 9.15953553342389e-05, "loss": 0.030816924571990967, "loss_d0": 0.03097556084394455, "step": 4610 }, { "epoch": 462.0, "grad_norm": 0.2412109375, "learning_rate": 8.853198881792772e-05, "loss": 0.03086390495300293, "loss_d0": 0.03101999256759882, "step": 4620 }, { "epoch": 463.0, "grad_norm": 0.26171875, "learning_rate": 8.549477776634832e-05, "loss": 0.030869218707084655, "loss_d0": 0.03101930357515812, "step": 4630 }, { "epoch": 464.0, "grad_norm": 0.330078125, "learning_rate": 8.24874225533205e-05, "loss": 0.030919986963272094, "loss_d0": 0.031084178015589714, "step": 4640 }, { "epoch": 465.0, "grad_norm": 0.2421875, "learning_rate": 7.951358717792378e-05, "loss": 0.030898517370223998, "loss_d0": 0.031054853461682796, "step": 4650 }, { "epoch": 466.0, "grad_norm": 0.265625, "learning_rate": 7.657689480047888e-05, "loss": 0.030780863761901856, "loss_d0": 0.030928720720112324, "step": 4660 }, { "epoch": 467.0, "grad_norm": 0.263671875, "learning_rate": 7.368092332828491e-05, "loss": 0.030795866250991823, "loss_d0": 0.03094344474375248, "step": 4670 }, { "epoch": 468.0, "grad_norm": 0.2314453125, "learning_rate": 7.082920105649054e-05, "loss": 0.0307847261428833, "loss_d0": 0.030939054489135743, "step": 4680 }, { "epoch": 469.0, "grad_norm": 0.337890625, "learning_rate": 6.80252023694098e-05, "loss": 0.03071419894695282, "loss_d0": 0.03086507637053728, "step": 4690 }, { "epoch": 470.0, "grad_norm": 0.2265625, "learning_rate": 6.527234350752003e-05, "loss": 0.030716896057128906, "loss_d0": 0.030868306756019592, "step": 4700 }, { "epoch": 471.0, "grad_norm": 0.30859375, "learning_rate": 6.257397840529903e-05, "loss": 0.030738791823387145, "loss_d0": 0.03089458905160427, "step": 4710 }, { "epoch": 472.0, "grad_norm": 0.328125, "learning_rate": 5.993339460497257e-05, "loss": 0.030718064308166503, "loss_d0": 0.030861228704452515, "step": 4720 }, { "epoch": 473.0, "grad_norm": 0.36328125, "learning_rate": 5.7353809251150606e-05, "loss": 0.030652564764022828, "loss_d0": 0.030811621434986593, "step": 4730 }, { "epoch": 474.0, "grad_norm": 0.26171875, "learning_rate": 5.483836517123214e-05, "loss": 0.030725887417793273, "loss_d0": 0.030884983018040657, "step": 4740 }, { "epoch": 475.0, "grad_norm": 0.251953125, "learning_rate": 5.239012704635402e-05, "loss": 0.03063139021396637, "loss_d0": 0.030782385356724264, "step": 4750 }, { "epoch": 476.0, "grad_norm": 0.2373046875, "learning_rate": 5.0012077677549283e-05, "loss": 0.030588483810424803, "loss_d0": 0.03074523825198412, "step": 4760 }, { "epoch": 477.0, "grad_norm": 0.26953125, "learning_rate": 4.77071143516634e-05, "loss": 0.030628728866577148, "loss_d0": 0.030772129260003566, "step": 4770 }, { "epoch": 478.0, "grad_norm": 0.314453125, "learning_rate": 4.547804531145656e-05, "loss": 0.03059217631816864, "loss_d0": 0.030741449631750583, "step": 4780 }, { "epoch": 479.0, "grad_norm": 0.275390625, "learning_rate": 4.332758633419252e-05, "loss": 0.030566006898880005, "loss_d0": 0.030716801062226295, "step": 4790 }, { "epoch": 480.0, "grad_norm": 0.26953125, "learning_rate": 4.12583574228822e-05, "loss": 0.03054628074169159, "loss_d0": 0.03070034421980381, "step": 4800 }, { "epoch": 481.0, "grad_norm": 0.291015625, "learning_rate": 3.927287961421382e-05, "loss": 0.030614039301872252, "loss_d0": 0.030754202604293825, "step": 4810 }, { "epoch": 482.0, "grad_norm": 0.296875, "learning_rate": 3.737357190705782e-05, "loss": 0.030571776628494262, "loss_d0": 0.030728295259177685, "step": 4820 }, { "epoch": 483.0, "grad_norm": 0.279296875, "learning_rate": 3.556274831528945e-05, "loss": 0.030562657117843627, "loss_d0": 0.030714512430131437, "step": 4830 }, { "epoch": 484.0, "grad_norm": 0.291015625, "learning_rate": 3.3842615048519255e-05, "loss": 0.030520126223564148, "loss_d0": 0.030660224333405494, "step": 4840 }, { "epoch": 485.0, "grad_norm": 0.25, "learning_rate": 3.221526782416659e-05, "loss": 0.030577152967453003, "loss_d0": 0.03073058519512415, "step": 4850 }, { "epoch": 486.0, "grad_norm": 0.259765625, "learning_rate": 3.068268931415069e-05, "loss": 0.030531370639801027, "loss_d0": 0.030680059641599654, "step": 4860 }, { "epoch": 487.0, "grad_norm": 0.267578125, "learning_rate": 2.9246746729310446e-05, "loss": 0.030506277084350587, "loss_d0": 0.030657671578228473, "step": 4870 }, { "epoch": 488.0, "grad_norm": 0.291015625, "learning_rate": 2.7909189544495435e-05, "loss": 0.030476900935173034, "loss_d0": 0.0306233711540699, "step": 4880 }, { "epoch": 489.0, "grad_norm": 0.236328125, "learning_rate": 2.6671647367100477e-05, "loss": 0.03056578040122986, "loss_d0": 0.030728305876255035, "step": 4890 }, { "epoch": 490.0, "grad_norm": 0.33203125, "learning_rate": 2.553562795163998e-05, "loss": 0.03052770495414734, "loss_d0": 0.0306832792237401, "step": 4900 }, { "epoch": 491.0, "grad_norm": 0.28125, "learning_rate": 2.450251536278129e-05, "loss": 0.030547520518302916, "loss_d0": 0.03070348985493183, "step": 4910 }, { "epoch": 492.0, "grad_norm": 0.244140625, "learning_rate": 2.3573568289075136e-05, "loss": 0.030580446124076843, "loss_d0": 0.030734172835946084, "step": 4920 }, { "epoch": 493.0, "grad_norm": 0.302734375, "learning_rate": 2.2749918509437493e-05, "loss": 0.0305540531873703, "loss_d0": 0.030697750858962536, "step": 4930 }, { "epoch": 494.0, "grad_norm": 0.240234375, "learning_rate": 2.2032569514251373e-05, "loss": 0.03054444193840027, "loss_d0": 0.030680439807474612, "step": 4940 }, { "epoch": 495.0, "grad_norm": 0.310546875, "learning_rate": 2.1422395282768234e-05, "loss": 0.030556756258010864, "loss_d0": 0.03069473523646593, "step": 4950 }, { "epoch": 496.0, "grad_norm": 0.28515625, "learning_rate": 2.092013921829899e-05, "loss": 0.03053104877471924, "loss_d0": 0.030675996840000153, "step": 4960 }, { "epoch": 497.0, "grad_norm": 0.279296875, "learning_rate": 2.0526413242491617e-05, "loss": 0.03049129843711853, "loss_d0": 0.03064910229295492, "step": 4970 }, { "epoch": 498.0, "grad_norm": 0.26953125, "learning_rate": 2.0241697049798773e-05, "loss": 0.030531305074691772, "loss_d0": 0.030683065764606, "step": 4980 }, { "epoch": 499.0, "grad_norm": 0.2578125, "learning_rate": 2.0066337523044098e-05, "loss": 0.030558282136917116, "loss_d0": 0.030703888833522798, "step": 4990 }, { "epoch": 500.0, "grad_norm": 0.248046875, "learning_rate": 2.0000548310798866e-05, "loss": 0.030524393916130065, "loss_d0": 0.030675517208874226, "step": 5000 }, { "epoch": 500.0, "eval_loss": 12.82153034210205, "eval_runtime": 0.1387, "eval_samples_per_second": 721.061, "eval_steps_per_second": 72.106, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.592828551364608e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }