{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 1608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.8916015625, "epoch": 0.00124421633811579, "grad_norm": 0.04375067884776007, "learning_rate": 1.7647058823529412e-06, "loss": 2.7179, "mean_token_accuracy": 0.627417977899313, "num_tokens": 957273.0, "step": 2 }, { "entropy": 3.9189453125, "epoch": 0.00248843267623158, "grad_norm": 0.04424783449636998, "learning_rate": 5.294117647058824e-06, "loss": 2.717, "mean_token_accuracy": 0.6276088338345289, "num_tokens": 1916763.0, "step": 4 }, { "entropy": 3.9345703125, "epoch": 0.00373264901434737, "grad_norm": 0.044169311229086676, "learning_rate": 8.823529411764707e-06, "loss": 2.7174, "mean_token_accuracy": 0.6269403174519539, "num_tokens": 2884483.0, "step": 6 }, { "entropy": 3.86669921875, "epoch": 0.00497686535246316, "grad_norm": 0.043734994375829034, "learning_rate": 1.2352941176470587e-05, "loss": 2.7123, "mean_token_accuracy": 0.6266846135258675, "num_tokens": 3834394.0, "step": 8 }, { "entropy": 3.84765625, "epoch": 0.00622108169057895, "grad_norm": 0.04478739685061408, "learning_rate": 1.5882352941176473e-05, "loss": 2.6624, "mean_token_accuracy": 0.6316747814416885, "num_tokens": 4814867.0, "step": 10 }, { "entropy": 3.703125, "epoch": 0.00746529802869474, "grad_norm": 0.04394022138020568, "learning_rate": 1.9411764705882355e-05, "loss": 2.6154, "mean_token_accuracy": 0.6323299612849951, "num_tokens": 5788115.0, "step": 12 }, { "entropy": 3.591796875, "epoch": 0.00870951436681053, "grad_norm": 0.044132756754733486, "learning_rate": 2.2941176470588233e-05, "loss": 2.5602, "mean_token_accuracy": 0.6358892396092415, "num_tokens": 6764048.0, "step": 14 }, { "entropy": 3.4609375, "epoch": 0.00995373070492632, "grad_norm": 0.043933966745939, "learning_rate": 2.647058823529412e-05, "loss": 2.5284, "mean_token_accuracy": 0.6353750042617321, "num_tokens": 7721558.0, "step": 16 }, { "entropy": 3.25830078125, "epoch": 0.01119794704304211, "grad_norm": 0.04262581471895549, "learning_rate": 3e-05, "loss": 2.4236, "mean_token_accuracy": 0.6459825430065393, "num_tokens": 8703166.0, "step": 18 }, { "entropy": 3.08251953125, "epoch": 0.0124421633811579, "grad_norm": 0.041446806689034235, "learning_rate": 2.9981244138793376e-05, "loss": 2.3626, "mean_token_accuracy": 0.6500243619084358, "num_tokens": 9669872.0, "step": 20 }, { "entropy": 2.916015625, "epoch": 0.01368637971927369, "grad_norm": 0.040165180244055104, "learning_rate": 2.9962488277586748e-05, "loss": 2.2867, "mean_token_accuracy": 0.6563531290739775, "num_tokens": 10638302.0, "step": 22 }, { "entropy": 2.76123046875, "epoch": 0.01493059605738948, "grad_norm": 0.0399557706068265, "learning_rate": 2.994373241638012e-05, "loss": 2.2138, "mean_token_accuracy": 0.6658517103642225, "num_tokens": 11608044.0, "step": 24 }, { "entropy": 2.6171875, "epoch": 0.016174812395505268, "grad_norm": 0.03929459981537806, "learning_rate": 2.992497655517349e-05, "loss": 2.1438, "mean_token_accuracy": 0.6765973474830389, "num_tokens": 12566238.0, "step": 26 }, { "entropy": 2.5, "epoch": 0.01741902873362106, "grad_norm": 0.04007888397827045, "learning_rate": 2.9906220693966867e-05, "loss": 2.0814, "mean_token_accuracy": 0.6787540279328823, "num_tokens": 13518945.0, "step": 28 }, { "entropy": 2.38818359375, "epoch": 0.018663245071736848, "grad_norm": 0.04030043340328399, "learning_rate": 2.988746483276024e-05, "loss": 2.0142, "mean_token_accuracy": 0.6849971897900105, "num_tokens": 14460672.0, "step": 30 }, { "entropy": 2.29736328125, "epoch": 0.01990746140985264, "grad_norm": 0.039903859220650706, "learning_rate": 2.986870897155361e-05, "loss": 1.9302, "mean_token_accuracy": 0.6909354832023382, "num_tokens": 15412383.0, "step": 32 }, { "entropy": 2.20068359375, "epoch": 0.021151677747968427, "grad_norm": 0.039589187039968075, "learning_rate": 2.9849953110346982e-05, "loss": 1.8311, "mean_token_accuracy": 0.7058284785598516, "num_tokens": 16382675.0, "step": 34 }, { "entropy": 2.1337890625, "epoch": 0.02239589408608422, "grad_norm": 0.04048044906935387, "learning_rate": 2.9831197249140357e-05, "loss": 1.7669, "mean_token_accuracy": 0.7101518884301186, "num_tokens": 17338717.0, "step": 36 }, { "entropy": 2.074951171875, "epoch": 0.023640110424200007, "grad_norm": 0.040509203114577085, "learning_rate": 2.9812441387933732e-05, "loss": 1.6784, "mean_token_accuracy": 0.7168243546038866, "num_tokens": 18308885.0, "step": 38 }, { "entropy": 2.009765625, "epoch": 0.0248843267623158, "grad_norm": 0.04025989903113805, "learning_rate": 2.97936855267271e-05, "loss": 1.5991, "mean_token_accuracy": 0.7300447728484869, "num_tokens": 19278799.0, "step": 40 }, { "entropy": 1.927978515625, "epoch": 0.026128543100431587, "grad_norm": 0.04188718131403856, "learning_rate": 2.9774929665520476e-05, "loss": 1.518, "mean_token_accuracy": 0.7441737987101078, "num_tokens": 20237950.0, "step": 42 }, { "entropy": 1.843994140625, "epoch": 0.02737275943854738, "grad_norm": 0.04050419320108763, "learning_rate": 2.9756173804313848e-05, "loss": 1.4427, "mean_token_accuracy": 0.7544036228209734, "num_tokens": 21195456.0, "step": 44 }, { "entropy": 1.769775390625, "epoch": 0.028616975776663167, "grad_norm": 0.037611229887135056, "learning_rate": 2.9737417943107223e-05, "loss": 1.3694, "mean_token_accuracy": 0.7618667017668486, "num_tokens": 22153393.0, "step": 46 }, { "entropy": 1.677734375, "epoch": 0.02986119211477896, "grad_norm": 0.03746635019302526, "learning_rate": 2.9718662081900595e-05, "loss": 1.312, "mean_token_accuracy": 0.7719069700688124, "num_tokens": 23122688.0, "step": 48 }, { "entropy": 1.580322265625, "epoch": 0.031105408452894747, "grad_norm": 0.03495485154553519, "learning_rate": 2.9699906220693967e-05, "loss": 1.2367, "mean_token_accuracy": 0.7826033104211092, "num_tokens": 24094625.0, "step": 50 }, { "entropy": 1.508056640625, "epoch": 0.032349624791010535, "grad_norm": 0.03221473070906054, "learning_rate": 2.9681150359487342e-05, "loss": 1.1953, "mean_token_accuracy": 0.786092720925808, "num_tokens": 25063310.0, "step": 52 }, { "entropy": 1.444091796875, "epoch": 0.03359384112912633, "grad_norm": 0.02998305029369566, "learning_rate": 2.9662394498280714e-05, "loss": 1.1475, "mean_token_accuracy": 0.7919168993830681, "num_tokens": 26035019.0, "step": 54 }, { "entropy": 1.41845703125, "epoch": 0.03483805746724212, "grad_norm": 0.030100662099233916, "learning_rate": 2.964363863707409e-05, "loss": 1.1088, "mean_token_accuracy": 0.7974316887557507, "num_tokens": 26986263.0, "step": 56 }, { "entropy": 1.37841796875, "epoch": 0.0360822738053579, "grad_norm": 0.029168854204663745, "learning_rate": 2.9624882775867457e-05, "loss": 1.0664, "mean_token_accuracy": 0.8031521048396826, "num_tokens": 27954783.0, "step": 58 }, { "entropy": 1.3515625, "epoch": 0.037326490143473695, "grad_norm": 0.028592424763496094, "learning_rate": 2.9606126914660832e-05, "loss": 1.0269, "mean_token_accuracy": 0.8082536458969116, "num_tokens": 28913274.0, "step": 60 }, { "entropy": 1.32275390625, "epoch": 0.03857070648158949, "grad_norm": 0.02711502925224832, "learning_rate": 2.9587371053454204e-05, "loss": 0.996, "mean_token_accuracy": 0.8141133449971676, "num_tokens": 29865984.0, "step": 62 }, { "entropy": 1.2802734375, "epoch": 0.03981492281970528, "grad_norm": 0.025707649345845223, "learning_rate": 2.956861519224758e-05, "loss": 0.9518, "mean_token_accuracy": 0.8208200559020042, "num_tokens": 30832564.0, "step": 64 }, { "entropy": 1.233642578125, "epoch": 0.04105913915782106, "grad_norm": 0.02568435573433197, "learning_rate": 2.954985933104095e-05, "loss": 0.9158, "mean_token_accuracy": 0.8275469224900007, "num_tokens": 31792150.0, "step": 66 }, { "entropy": 1.20263671875, "epoch": 0.042303355495936855, "grad_norm": 0.024265992451134286, "learning_rate": 2.9531103469834323e-05, "loss": 0.8929, "mean_token_accuracy": 0.8309035133570433, "num_tokens": 32772012.0, "step": 68 }, { "entropy": 1.1630859375, "epoch": 0.04354757183405265, "grad_norm": 0.022293135841185124, "learning_rate": 2.9512347608627698e-05, "loss": 0.8628, "mean_token_accuracy": 0.8341421224176884, "num_tokens": 33733983.0, "step": 70 }, { "entropy": 1.11865234375, "epoch": 0.04479178817216844, "grad_norm": 0.019848328568626967, "learning_rate": 2.949359174742107e-05, "loss": 0.8336, "mean_token_accuracy": 0.8379728831350803, "num_tokens": 34699264.0, "step": 72 }, { "entropy": 1.075927734375, "epoch": 0.04603600451028422, "grad_norm": 0.019049148365157205, "learning_rate": 2.9474835886214445e-05, "loss": 0.8134, "mean_token_accuracy": 0.8414452541619539, "num_tokens": 35667749.0, "step": 74 }, { "entropy": 1.04248046875, "epoch": 0.047280220848400015, "grad_norm": 0.020379115722980193, "learning_rate": 2.9456080025007814e-05, "loss": 0.7956, "mean_token_accuracy": 0.8453655540943146, "num_tokens": 36618250.0, "step": 76 }, { "entropy": 1.005615234375, "epoch": 0.048524437186515806, "grad_norm": 0.01735763390895165, "learning_rate": 2.943732416380119e-05, "loss": 0.7728, "mean_token_accuracy": 0.8468529600650072, "num_tokens": 37598645.0, "step": 78 }, { "entropy": 0.975830078125, "epoch": 0.0497686535246316, "grad_norm": 0.018308151005361317, "learning_rate": 2.941856830259456e-05, "loss": 0.756, "mean_token_accuracy": 0.8498339783400297, "num_tokens": 38581648.0, "step": 80 }, { "entropy": 0.95166015625, "epoch": 0.05101286986274738, "grad_norm": 0.016887734308931736, "learning_rate": 2.9399812441387936e-05, "loss": 0.7489, "mean_token_accuracy": 0.8497179094702005, "num_tokens": 39558927.0, "step": 82 }, { "entropy": 0.907470703125, "epoch": 0.052257086200863175, "grad_norm": 0.016681384874127802, "learning_rate": 2.9381056580181304e-05, "loss": 0.7161, "mean_token_accuracy": 0.8557771425694227, "num_tokens": 40527856.0, "step": 84 }, { "entropy": 0.88720703125, "epoch": 0.053501302538978966, "grad_norm": 0.01516475435038533, "learning_rate": 2.936230071897468e-05, "loss": 0.7073, "mean_token_accuracy": 0.8551859520375729, "num_tokens": 41479733.0, "step": 86 }, { "entropy": 0.864501953125, "epoch": 0.05474551887709476, "grad_norm": 0.01251344153431511, "learning_rate": 2.9343544857768055e-05, "loss": 0.7036, "mean_token_accuracy": 0.8547059707343578, "num_tokens": 42447079.0, "step": 88 }, { "entropy": 0.8360595703125, "epoch": 0.05598973521521054, "grad_norm": 0.011634507292653147, "learning_rate": 2.9324788996561426e-05, "loss": 0.6868, "mean_token_accuracy": 0.857071390375495, "num_tokens": 43400570.0, "step": 90 }, { "entropy": 0.821533203125, "epoch": 0.057233951553326334, "grad_norm": 0.011210219122059583, "learning_rate": 2.93060331353548e-05, "loss": 0.6825, "mean_token_accuracy": 0.8564629200845957, "num_tokens": 44362717.0, "step": 92 }, { "entropy": 0.8062744140625, "epoch": 0.058478167891442126, "grad_norm": 0.010930254287366652, "learning_rate": 2.928727727414817e-05, "loss": 0.6739, "mean_token_accuracy": 0.856963537633419, "num_tokens": 45335083.0, "step": 94 }, { "entropy": 0.7928466796875, "epoch": 0.05972238422955792, "grad_norm": 0.010786192510614955, "learning_rate": 2.9268521412941545e-05, "loss": 0.6673, "mean_token_accuracy": 0.8589914217591286, "num_tokens": 46297536.0, "step": 96 }, { "entropy": 0.7734375, "epoch": 0.0609666005676737, "grad_norm": 0.00995930490248025, "learning_rate": 2.9249765551734917e-05, "loss": 0.6593, "mean_token_accuracy": 0.8603709153831005, "num_tokens": 47266993.0, "step": 98 }, { "entropy": 0.747314453125, "epoch": 0.062210816905789494, "grad_norm": 0.009529968396945893, "learning_rate": 2.9231009690528292e-05, "loss": 0.6368, "mean_token_accuracy": 0.863072831183672, "num_tokens": 48219964.0, "step": 100 }, { "entropy": 0.743408203125, "epoch": 0.06345503324390528, "grad_norm": 0.0089252084362091, "learning_rate": 2.921225382932166e-05, "loss": 0.6459, "mean_token_accuracy": 0.861508522182703, "num_tokens": 49195163.0, "step": 102 }, { "entropy": 0.7353515625, "epoch": 0.06469924958202107, "grad_norm": 0.009286877010477545, "learning_rate": 2.9193497968115036e-05, "loss": 0.6505, "mean_token_accuracy": 0.8599878679960966, "num_tokens": 50152439.0, "step": 104 }, { "entropy": 0.723388671875, "epoch": 0.06594346592013686, "grad_norm": 0.009857745782260929, "learning_rate": 2.917474210690841e-05, "loss": 0.6368, "mean_token_accuracy": 0.8622154109179974, "num_tokens": 51110212.0, "step": 106 }, { "entropy": 0.7188720703125, "epoch": 0.06718768225825265, "grad_norm": 0.013151911597721819, "learning_rate": 2.9155986245701783e-05, "loss": 0.6325, "mean_token_accuracy": 0.862042736262083, "num_tokens": 52072746.0, "step": 108 }, { "entropy": 0.710693359375, "epoch": 0.06843189859636845, "grad_norm": 0.009937149777264806, "learning_rate": 2.9137230384495158e-05, "loss": 0.6191, "mean_token_accuracy": 0.8641938399523497, "num_tokens": 53019561.0, "step": 110 }, { "entropy": 0.7220458984375, "epoch": 0.06967611493448424, "grad_norm": 0.014280118182390616, "learning_rate": 2.9118474523288527e-05, "loss": 0.6163, "mean_token_accuracy": 0.8640485219657421, "num_tokens": 53990391.0, "step": 112 }, { "entropy": 0.71044921875, "epoch": 0.07092033127260003, "grad_norm": 0.016647005330032202, "learning_rate": 2.9099718662081902e-05, "loss": 0.6067, "mean_token_accuracy": 0.8654431011527777, "num_tokens": 54952155.0, "step": 114 }, { "entropy": 0.69873046875, "epoch": 0.0721645476107158, "grad_norm": 0.013601550779443158, "learning_rate": 2.9080962800875274e-05, "loss": 0.6021, "mean_token_accuracy": 0.8661979287862778, "num_tokens": 55923903.0, "step": 116 }, { "entropy": 0.6810302734375, "epoch": 0.0734087639488316, "grad_norm": 0.014658167785505546, "learning_rate": 2.906220693966865e-05, "loss": 0.5961, "mean_token_accuracy": 0.8666274584829807, "num_tokens": 56890166.0, "step": 118 }, { "entropy": 0.67529296875, "epoch": 0.07465298028694739, "grad_norm": 0.010433782983785733, "learning_rate": 2.904345107846202e-05, "loss": 0.5882, "mean_token_accuracy": 0.868389219045639, "num_tokens": 57843749.0, "step": 120 }, { "entropy": 0.6673583984375, "epoch": 0.07589719662506318, "grad_norm": 0.01158726988075527, "learning_rate": 2.9024695217255392e-05, "loss": 0.5805, "mean_token_accuracy": 0.8695809561759233, "num_tokens": 58795676.0, "step": 122 }, { "entropy": 0.6614990234375, "epoch": 0.07714141296317897, "grad_norm": 0.009345274566015846, "learning_rate": 2.9005939356048768e-05, "loss": 0.5711, "mean_token_accuracy": 0.870631393045187, "num_tokens": 59755907.0, "step": 124 }, { "entropy": 0.6644287109375, "epoch": 0.07838562930129477, "grad_norm": 0.009449361737772253, "learning_rate": 2.898718349484214e-05, "loss": 0.5746, "mean_token_accuracy": 0.8696553781628609, "num_tokens": 60711244.0, "step": 126 }, { "entropy": 0.66259765625, "epoch": 0.07962984563941056, "grad_norm": 0.010252287607322412, "learning_rate": 2.896842763363551e-05, "loss": 0.5726, "mean_token_accuracy": 0.8702653460204601, "num_tokens": 61661317.0, "step": 128 }, { "entropy": 0.6553955078125, "epoch": 0.08087406197752635, "grad_norm": 0.009826350368300495, "learning_rate": 2.8949671772428883e-05, "loss": 0.5726, "mean_token_accuracy": 0.8699843529611826, "num_tokens": 62642845.0, "step": 130 }, { "entropy": 0.6448974609375, "epoch": 0.08211827831564213, "grad_norm": 0.009066386980989197, "learning_rate": 2.8930915911222258e-05, "loss": 0.5612, "mean_token_accuracy": 0.8716051783412695, "num_tokens": 63601688.0, "step": 132 }, { "entropy": 0.6363525390625, "epoch": 0.08336249465375792, "grad_norm": 0.008088118946637307, "learning_rate": 2.891216005001563e-05, "loss": 0.5554, "mean_token_accuracy": 0.8725471515208483, "num_tokens": 64562041.0, "step": 134 }, { "entropy": 0.6317138671875, "epoch": 0.08460671099187371, "grad_norm": 0.013105816618610803, "learning_rate": 2.8893404188809005e-05, "loss": 0.5531, "mean_token_accuracy": 0.8727884497493505, "num_tokens": 65524590.0, "step": 136 }, { "entropy": 0.63916015625, "epoch": 0.0858509273299895, "grad_norm": 0.01909081119736018, "learning_rate": 2.8874648327602377e-05, "loss": 0.5536, "mean_token_accuracy": 0.8721345569938421, "num_tokens": 66478771.0, "step": 138 }, { "entropy": 0.646240234375, "epoch": 0.0870951436681053, "grad_norm": 0.009629608572019343, "learning_rate": 2.885589246639575e-05, "loss": 0.5594, "mean_token_accuracy": 0.8720645401626825, "num_tokens": 67449488.0, "step": 140 }, { "entropy": 0.6290283203125, "epoch": 0.08833936000622108, "grad_norm": 0.015544477133498205, "learning_rate": 2.8837136605189124e-05, "loss": 0.5418, "mean_token_accuracy": 0.8747996557503939, "num_tokens": 68410167.0, "step": 142 }, { "entropy": 0.622314453125, "epoch": 0.08958357634433688, "grad_norm": 0.008083402095717028, "learning_rate": 2.8818380743982496e-05, "loss": 0.5407, "mean_token_accuracy": 0.8747551310807467, "num_tokens": 69366784.0, "step": 144 }, { "entropy": 0.6163330078125, "epoch": 0.09082779268245267, "grad_norm": 0.005655753881121235, "learning_rate": 2.8799624882775868e-05, "loss": 0.5431, "mean_token_accuracy": 0.8741663359105587, "num_tokens": 70341166.0, "step": 146 }, { "entropy": 0.6080322265625, "epoch": 0.09207200902056845, "grad_norm": 0.013310517476886885, "learning_rate": 2.878086902156924e-05, "loss": 0.5405, "mean_token_accuracy": 0.874915199354291, "num_tokens": 71307815.0, "step": 148 }, { "entropy": 0.5987548828125, "epoch": 0.09331622535868424, "grad_norm": 0.0051796312256755135, "learning_rate": 2.8762113160362615e-05, "loss": 0.5389, "mean_token_accuracy": 0.875050388276577, "num_tokens": 72270020.0, "step": 150 }, { "entropy": 0.587890625, "epoch": 0.09456044169680003, "grad_norm": 0.004922183957384014, "learning_rate": 2.8743357299155986e-05, "loss": 0.527, "mean_token_accuracy": 0.8766882754862309, "num_tokens": 73220216.0, "step": 152 }, { "entropy": 0.587890625, "epoch": 0.09580465803491582, "grad_norm": 0.004954187828101408, "learning_rate": 2.872460143794936e-05, "loss": 0.5283, "mean_token_accuracy": 0.8765918165445328, "num_tokens": 74170528.0, "step": 154 }, { "entropy": 0.6051025390625, "epoch": 0.09704887437303161, "grad_norm": 0.004742784771105153, "learning_rate": 2.8705845576742733e-05, "loss": 0.5446, "mean_token_accuracy": 0.8737094234675169, "num_tokens": 75125871.0, "step": 156 }, { "entropy": 0.583984375, "epoch": 0.0982930907111474, "grad_norm": 0.00449682380005368, "learning_rate": 2.8687089715536105e-05, "loss": 0.5261, "mean_token_accuracy": 0.8764713611453772, "num_tokens": 76082376.0, "step": 158 }, { "entropy": 0.590576171875, "epoch": 0.0995373070492632, "grad_norm": 0.005095159903783169, "learning_rate": 2.866833385432948e-05, "loss": 0.5335, "mean_token_accuracy": 0.8752720728516579, "num_tokens": 77043312.0, "step": 160 }, { "entropy": 0.593505859375, "epoch": 0.10078152338737897, "grad_norm": 0.004736338556288233, "learning_rate": 2.8649577993122852e-05, "loss": 0.5323, "mean_token_accuracy": 0.8756220769137144, "num_tokens": 78022410.0, "step": 162 }, { "entropy": 0.5909423828125, "epoch": 0.10202573972549477, "grad_norm": 0.0048435409298364695, "learning_rate": 2.8630822131916224e-05, "loss": 0.5292, "mean_token_accuracy": 0.8756781909614801, "num_tokens": 78994721.0, "step": 164 }, { "entropy": 0.5823974609375, "epoch": 0.10326995606361056, "grad_norm": 0.0043600039927944095, "learning_rate": 2.8612066270709596e-05, "loss": 0.5265, "mean_token_accuracy": 0.8766606859862804, "num_tokens": 79977056.0, "step": 166 }, { "entropy": 0.55426025390625, "epoch": 0.10451417240172635, "grad_norm": 0.00450761103408803, "learning_rate": 2.859331040950297e-05, "loss": 0.5059, "mean_token_accuracy": 0.8799897208809853, "num_tokens": 80932917.0, "step": 168 }, { "entropy": 0.5655517578125, "epoch": 0.10575838873984214, "grad_norm": 0.00419735748132864, "learning_rate": 2.8574554548296343e-05, "loss": 0.5121, "mean_token_accuracy": 0.8786576054990292, "num_tokens": 81875228.0, "step": 170 }, { "entropy": 0.5784912109375, "epoch": 0.10700260507795793, "grad_norm": 0.004457364964672314, "learning_rate": 2.8555798687089715e-05, "loss": 0.5257, "mean_token_accuracy": 0.8765816017985344, "num_tokens": 82850393.0, "step": 172 }, { "entropy": 0.577880859375, "epoch": 0.10824682141607372, "grad_norm": 0.004158010838208147, "learning_rate": 2.853704282588309e-05, "loss": 0.5181, "mean_token_accuracy": 0.8781408928334713, "num_tokens": 83829392.0, "step": 174 }, { "entropy": 0.5699462890625, "epoch": 0.10949103775418952, "grad_norm": 0.004510444114803957, "learning_rate": 2.851828696467646e-05, "loss": 0.5198, "mean_token_accuracy": 0.8775961492210627, "num_tokens": 84788897.0, "step": 176 }, { "entropy": 0.574462890625, "epoch": 0.1107352540923053, "grad_norm": 0.004177521831715433, "learning_rate": 2.8499531103469837e-05, "loss": 0.5202, "mean_token_accuracy": 0.877587117254734, "num_tokens": 85767942.0, "step": 178 }, { "entropy": 0.5682373046875, "epoch": 0.11197947043042109, "grad_norm": 0.004233272414436176, "learning_rate": 2.848077524226321e-05, "loss": 0.5141, "mean_token_accuracy": 0.8780548553913832, "num_tokens": 86739785.0, "step": 180 }, { "entropy": 0.5634765625, "epoch": 0.11322368676853688, "grad_norm": 0.004251848292136647, "learning_rate": 2.846201938105658e-05, "loss": 0.5122, "mean_token_accuracy": 0.8789201825857162, "num_tokens": 87711034.0, "step": 182 }, { "entropy": 0.5673828125, "epoch": 0.11446790310665267, "grad_norm": 0.004823159725451837, "learning_rate": 2.8443263519849952e-05, "loss": 0.5181, "mean_token_accuracy": 0.8779822532087564, "num_tokens": 88687991.0, "step": 184 }, { "entropy": 0.5587158203125, "epoch": 0.11571211944476846, "grad_norm": 0.003971978808828467, "learning_rate": 2.8424507658643327e-05, "loss": 0.5081, "mean_token_accuracy": 0.87936832010746, "num_tokens": 89652458.0, "step": 186 }, { "entropy": 0.5670166015625, "epoch": 0.11695633578288425, "grad_norm": 0.004599875483193996, "learning_rate": 2.8405751797436703e-05, "loss": 0.5168, "mean_token_accuracy": 0.8775633033365011, "num_tokens": 90625390.0, "step": 188 }, { "entropy": 0.565673828125, "epoch": 0.11820055212100004, "grad_norm": 0.004410995708472844, "learning_rate": 2.838699593623007e-05, "loss": 0.5124, "mean_token_accuracy": 0.8781507574021816, "num_tokens": 91588111.0, "step": 190 }, { "entropy": 0.55169677734375, "epoch": 0.11944476845911584, "grad_norm": 0.004323225361166531, "learning_rate": 2.8368240075023446e-05, "loss": 0.5022, "mean_token_accuracy": 0.8810433447360992, "num_tokens": 92555948.0, "step": 192 }, { "entropy": 0.562255859375, "epoch": 0.12068898479723161, "grad_norm": 0.004283609607054523, "learning_rate": 2.8349484213816818e-05, "loss": 0.5117, "mean_token_accuracy": 0.8785576019436121, "num_tokens": 93510156.0, "step": 194 }, { "entropy": 0.55267333984375, "epoch": 0.1219332011353474, "grad_norm": 0.0038985910121687667, "learning_rate": 2.8330728352610193e-05, "loss": 0.5026, "mean_token_accuracy": 0.880028972402215, "num_tokens": 94470766.0, "step": 196 }, { "entropy": 0.55535888671875, "epoch": 0.1231774174734632, "grad_norm": 0.003942758357608937, "learning_rate": 2.831197249140356e-05, "loss": 0.5079, "mean_token_accuracy": 0.8788180500268936, "num_tokens": 95441029.0, "step": 198 }, { "entropy": 0.55352783203125, "epoch": 0.12442163381157899, "grad_norm": 0.0038519246292422, "learning_rate": 2.8293216630196937e-05, "loss": 0.5078, "mean_token_accuracy": 0.8789091128855944, "num_tokens": 96405054.0, "step": 200 }, { "epoch": 0.12442163381157899, "eval_entropy": 0.5564487744645494, "eval_loss": 0.5077720284461975, "eval_mean_token_accuracy": 0.8789945521682193, "eval_num_tokens": 96405054.0, "eval_runtime": 425.6064, "eval_samples_per_second": 203.545, "eval_steps_per_second": 3.181, "step": 200 }, { "entropy": 0.553955078125, "epoch": 0.12566585014969478, "grad_norm": 0.004202481820270757, "learning_rate": 2.827446076899031e-05, "loss": 0.5069, "mean_token_accuracy": 0.8796794656664133, "num_tokens": 97357480.0, "step": 202 }, { "entropy": 0.5616455078125, "epoch": 0.12691006648781056, "grad_norm": 0.004342700177450456, "learning_rate": 2.8255704907783684e-05, "loss": 0.5078, "mean_token_accuracy": 0.8793207015842199, "num_tokens": 98324930.0, "step": 204 }, { "entropy": 0.55084228515625, "epoch": 0.12815428282592636, "grad_norm": 0.004518013966570155, "learning_rate": 2.823694904657706e-05, "loss": 0.4999, "mean_token_accuracy": 0.8799815904349089, "num_tokens": 99308379.0, "step": 206 }, { "entropy": 0.53485107421875, "epoch": 0.12939849916404214, "grad_norm": 0.003980418228294994, "learning_rate": 2.8218193185370427e-05, "loss": 0.4875, "mean_token_accuracy": 0.8826124537736177, "num_tokens": 100284945.0, "step": 208 }, { "entropy": 0.5411376953125, "epoch": 0.13064271550215795, "grad_norm": 0.004226576134816777, "learning_rate": 2.8199437324163803e-05, "loss": 0.4939, "mean_token_accuracy": 0.881903612986207, "num_tokens": 101239875.0, "step": 210 }, { "entropy": 0.541748046875, "epoch": 0.13188693184027372, "grad_norm": 0.004095557010591326, "learning_rate": 2.8180681462957174e-05, "loss": 0.4943, "mean_token_accuracy": 0.8809374794363976, "num_tokens": 102193649.0, "step": 212 }, { "entropy": 0.552734375, "epoch": 0.13313114817838953, "grad_norm": 0.004677599786034972, "learning_rate": 2.816192560175055e-05, "loss": 0.5077, "mean_token_accuracy": 0.8783110100775957, "num_tokens": 103161885.0, "step": 214 }, { "entropy": 0.5550537109375, "epoch": 0.1343753645165053, "grad_norm": 0.004111724861610862, "learning_rate": 2.8143169740543918e-05, "loss": 0.5102, "mean_token_accuracy": 0.878773296251893, "num_tokens": 104130032.0, "step": 216 }, { "entropy": 0.5455322265625, "epoch": 0.13561958085462109, "grad_norm": 0.004460405407464148, "learning_rate": 2.8124413879337293e-05, "loss": 0.4999, "mean_token_accuracy": 0.8797915708273649, "num_tokens": 105100321.0, "step": 218 }, { "entropy": 0.54638671875, "epoch": 0.1368637971927369, "grad_norm": 0.00472624547675932, "learning_rate": 2.8105658018130665e-05, "loss": 0.4995, "mean_token_accuracy": 0.8797464743256569, "num_tokens": 106049793.0, "step": 220 }, { "entropy": 0.54315185546875, "epoch": 0.13810801353085267, "grad_norm": 0.00420817351847422, "learning_rate": 2.808690215692404e-05, "loss": 0.4999, "mean_token_accuracy": 0.879876684397459, "num_tokens": 107013781.0, "step": 222 }, { "entropy": 0.5377197265625, "epoch": 0.13935222986896847, "grad_norm": 0.004992303992386141, "learning_rate": 2.8068146295717415e-05, "loss": 0.4898, "mean_token_accuracy": 0.8818494435399771, "num_tokens": 107975843.0, "step": 224 }, { "entropy": 0.53900146484375, "epoch": 0.14059644620708425, "grad_norm": 0.0041431739680149356, "learning_rate": 2.8049390434510784e-05, "loss": 0.4946, "mean_token_accuracy": 0.8811660911887884, "num_tokens": 108942825.0, "step": 226 }, { "entropy": 0.54473876953125, "epoch": 0.14184066254520006, "grad_norm": 0.00427744544585974, "learning_rate": 2.803063457330416e-05, "loss": 0.5, "mean_token_accuracy": 0.8794045951217413, "num_tokens": 109913094.0, "step": 228 }, { "entropy": 0.53448486328125, "epoch": 0.14308487888331584, "grad_norm": 0.004451259283818379, "learning_rate": 2.801187871209753e-05, "loss": 0.4885, "mean_token_accuracy": 0.8816246408969164, "num_tokens": 110866687.0, "step": 230 }, { "entropy": 0.53662109375, "epoch": 0.1443290952214316, "grad_norm": 0.005154362408257279, "learning_rate": 2.7993122850890906e-05, "loss": 0.495, "mean_token_accuracy": 0.88081955909729, "num_tokens": 111835840.0, "step": 232 }, { "entropy": 0.53924560546875, "epoch": 0.14557331155954742, "grad_norm": 0.004346554324761445, "learning_rate": 2.7974366989684274e-05, "loss": 0.4959, "mean_token_accuracy": 0.8800926674157381, "num_tokens": 112814274.0, "step": 234 }, { "entropy": 0.5380859375, "epoch": 0.1468175278976632, "grad_norm": 0.0044365543171540935, "learning_rate": 2.795561112847765e-05, "loss": 0.487, "mean_token_accuracy": 0.8816618304699659, "num_tokens": 113764286.0, "step": 236 }, { "entropy": 0.53570556640625, "epoch": 0.148061744235779, "grad_norm": 0.0041143226767026185, "learning_rate": 2.793685526727102e-05, "loss": 0.486, "mean_token_accuracy": 0.8824022393673658, "num_tokens": 114729023.0, "step": 238 }, { "entropy": 0.5443115234375, "epoch": 0.14930596057389478, "grad_norm": 0.004746456791016532, "learning_rate": 2.7918099406064397e-05, "loss": 0.4993, "mean_token_accuracy": 0.8796773850917816, "num_tokens": 115702887.0, "step": 240 }, { "entropy": 0.5418701171875, "epoch": 0.15055017691201059, "grad_norm": 0.00480922089737296, "learning_rate": 2.789934354485777e-05, "loss": 0.4963, "mean_token_accuracy": 0.8802901618182659, "num_tokens": 116678448.0, "step": 242 }, { "entropy": 0.53765869140625, "epoch": 0.15179439325012636, "grad_norm": 0.004630745457045256, "learning_rate": 2.788058768365114e-05, "loss": 0.4944, "mean_token_accuracy": 0.8803971409797668, "num_tokens": 117645118.0, "step": 244 }, { "entropy": 0.53546142578125, "epoch": 0.15303860958824217, "grad_norm": 0.004292368656085013, "learning_rate": 2.7861831822444516e-05, "loss": 0.4898, "mean_token_accuracy": 0.8815171681344509, "num_tokens": 118609718.0, "step": 246 }, { "entropy": 0.53009033203125, "epoch": 0.15428282592635795, "grad_norm": 0.00432853447352249, "learning_rate": 2.7843075961237887e-05, "loss": 0.4822, "mean_token_accuracy": 0.8823816385120153, "num_tokens": 119581719.0, "step": 248 }, { "entropy": 0.53131103515625, "epoch": 0.15552704226447372, "grad_norm": 0.004266208568860564, "learning_rate": 2.7824320100031263e-05, "loss": 0.4866, "mean_token_accuracy": 0.8818963132798672, "num_tokens": 120561738.0, "step": 250 }, { "entropy": 0.52288818359375, "epoch": 0.15677125860258953, "grad_norm": 0.0041049614545513435, "learning_rate": 2.780556423882463e-05, "loss": 0.4776, "mean_token_accuracy": 0.8831405956298113, "num_tokens": 121516626.0, "step": 252 }, { "entropy": 0.53668212890625, "epoch": 0.1580154749407053, "grad_norm": 0.004899191475369027, "learning_rate": 2.7786808377618006e-05, "loss": 0.4904, "mean_token_accuracy": 0.8802980054169893, "num_tokens": 122497298.0, "step": 254 }, { "entropy": 0.528076171875, "epoch": 0.1592596912788211, "grad_norm": 0.004468998508890563, "learning_rate": 2.776805251641138e-05, "loss": 0.4825, "mean_token_accuracy": 0.8819232229143381, "num_tokens": 123460525.0, "step": 256 }, { "entropy": 0.52655029296875, "epoch": 0.1605039076169369, "grad_norm": 0.00419010446138929, "learning_rate": 2.7749296655204753e-05, "loss": 0.4806, "mean_token_accuracy": 0.8823020961135626, "num_tokens": 124418896.0, "step": 258 }, { "entropy": 0.52166748046875, "epoch": 0.1617481239550527, "grad_norm": 0.004482564049216853, "learning_rate": 2.7730540793998125e-05, "loss": 0.4737, "mean_token_accuracy": 0.8841393161565065, "num_tokens": 125389389.0, "step": 260 }, { "entropy": 0.5216064453125, "epoch": 0.16299234029316848, "grad_norm": 0.0058174813454089005, "learning_rate": 2.7711784932791497e-05, "loss": 0.4779, "mean_token_accuracy": 0.883222334086895, "num_tokens": 126349612.0, "step": 262 }, { "entropy": 0.52642822265625, "epoch": 0.16423655663128425, "grad_norm": 0.00496875366145162, "learning_rate": 2.7693029071584872e-05, "loss": 0.4837, "mean_token_accuracy": 0.8812949676066637, "num_tokens": 127311570.0, "step": 264 }, { "entropy": 0.5391845703125, "epoch": 0.16548077296940006, "grad_norm": 0.005334655740872404, "learning_rate": 2.7674273210378244e-05, "loss": 0.4892, "mean_token_accuracy": 0.8805368673056364, "num_tokens": 128292363.0, "step": 266 }, { "entropy": 0.52459716796875, "epoch": 0.16672498930751584, "grad_norm": 0.005404841905105513, "learning_rate": 2.765551734917162e-05, "loss": 0.477, "mean_token_accuracy": 0.882431847974658, "num_tokens": 129256872.0, "step": 268 }, { "entropy": 0.51409912109375, "epoch": 0.16796920564563164, "grad_norm": 0.00680497039064007, "learning_rate": 2.7636761487964987e-05, "loss": 0.467, "mean_token_accuracy": 0.8841554988175631, "num_tokens": 130209312.0, "step": 270 }, { "entropy": 0.51226806640625, "epoch": 0.16921342198374742, "grad_norm": 0.00966453030617695, "learning_rate": 2.7618005626758363e-05, "loss": 0.4638, "mean_token_accuracy": 0.8842370640486479, "num_tokens": 131165468.0, "step": 272 }, { "entropy": 0.5189208984375, "epoch": 0.17045763832186323, "grad_norm": 0.010724506890179331, "learning_rate": 2.7599249765551738e-05, "loss": 0.476, "mean_token_accuracy": 0.8822486810386181, "num_tokens": 132127505.0, "step": 274 }, { "entropy": 0.52581787109375, "epoch": 0.171701854659979, "grad_norm": 0.023365346055409998, "learning_rate": 2.758049390434511e-05, "loss": 0.4814, "mean_token_accuracy": 0.881921524181962, "num_tokens": 133098851.0, "step": 276 }, { "entropy": 0.52716064453125, "epoch": 0.17294607099809478, "grad_norm": 0.050981395163568886, "learning_rate": 2.756173804313848e-05, "loss": 0.4741, "mean_token_accuracy": 0.8824981413781643, "num_tokens": 134082880.0, "step": 278 }, { "entropy": 0.52099609375, "epoch": 0.1741902873362106, "grad_norm": 0.03383909574907662, "learning_rate": 2.7542982181931853e-05, "loss": 0.4771, "mean_token_accuracy": 0.8825534675270319, "num_tokens": 135058906.0, "step": 280 }, { "entropy": 0.54901123046875, "epoch": 0.17543450367432636, "grad_norm": 1.0709143398068652, "learning_rate": 2.752422632072523e-05, "loss": 0.4815, "mean_token_accuracy": 0.8821801356971264, "num_tokens": 136019797.0, "step": 282 }, { "entropy": 0.5281982421875, "epoch": 0.17667872001244217, "grad_norm": 0.023875902141304983, "learning_rate": 2.75054704595186e-05, "loss": 0.4776, "mean_token_accuracy": 0.8827728591859341, "num_tokens": 136974924.0, "step": 284 }, { "entropy": 0.53155517578125, "epoch": 0.17792293635055795, "grad_norm": 0.01810897069920511, "learning_rate": 2.7486714598311972e-05, "loss": 0.4822, "mean_token_accuracy": 0.8827848546206951, "num_tokens": 137938387.0, "step": 286 }, { "entropy": 0.534423828125, "epoch": 0.17916715268867375, "grad_norm": 0.023368989173116334, "learning_rate": 2.7467958737105344e-05, "loss": 0.4749, "mean_token_accuracy": 0.8840332254767418, "num_tokens": 138882541.0, "step": 288 }, { "entropy": 0.52618408203125, "epoch": 0.18041136902678953, "grad_norm": 0.008847330585999721, "learning_rate": 2.744920287589872e-05, "loss": 0.4727, "mean_token_accuracy": 0.8844450321048498, "num_tokens": 139834136.0, "step": 290 }, { "entropy": 0.51177978515625, "epoch": 0.18165558536490534, "grad_norm": 0.007230035683927291, "learning_rate": 2.7430447014692094e-05, "loss": 0.4667, "mean_token_accuracy": 0.8858618028461933, "num_tokens": 140801473.0, "step": 292 }, { "entropy": 0.5269775390625, "epoch": 0.18289980170302111, "grad_norm": 0.00591762886464884, "learning_rate": 2.7411691153485466e-05, "loss": 0.4847, "mean_token_accuracy": 0.8820613399147987, "num_tokens": 141774333.0, "step": 294 }, { "entropy": 0.52105712890625, "epoch": 0.1841440180411369, "grad_norm": 0.005779537682240174, "learning_rate": 2.7392935292278838e-05, "loss": 0.4753, "mean_token_accuracy": 0.8836770989000797, "num_tokens": 142747100.0, "step": 296 }, { "entropy": 0.52313232421875, "epoch": 0.1853882343792527, "grad_norm": 0.006023867846183252, "learning_rate": 2.737417943107221e-05, "loss": 0.474, "mean_token_accuracy": 0.883987670764327, "num_tokens": 143717770.0, "step": 298 }, { "entropy": 0.52142333984375, "epoch": 0.18663245071736848, "grad_norm": 0.005119689495386404, "learning_rate": 2.7355423569865585e-05, "loss": 0.4747, "mean_token_accuracy": 0.8837161175906658, "num_tokens": 144684804.0, "step": 300 }, { "entropy": 0.5142822265625, "epoch": 0.18787666705548428, "grad_norm": 0.0052708601475933425, "learning_rate": 2.7336667708658957e-05, "loss": 0.4724, "mean_token_accuracy": 0.8843117468059063, "num_tokens": 145645327.0, "step": 302 }, { "entropy": 0.51446533203125, "epoch": 0.18912088339360006, "grad_norm": 0.004922216998316442, "learning_rate": 2.731791184745233e-05, "loss": 0.4657, "mean_token_accuracy": 0.8856261819601059, "num_tokens": 146617663.0, "step": 304 }, { "entropy": 0.51544189453125, "epoch": 0.19036509973171586, "grad_norm": 0.005014665257788346, "learning_rate": 2.72991559862457e-05, "loss": 0.4715, "mean_token_accuracy": 0.8838825188577175, "num_tokens": 147583412.0, "step": 306 }, { "entropy": 0.50738525390625, "epoch": 0.19160931606983164, "grad_norm": 0.005115551712906193, "learning_rate": 2.7280400125039075e-05, "loss": 0.4697, "mean_token_accuracy": 0.884763790294528, "num_tokens": 148557923.0, "step": 308 }, { "entropy": 0.51544189453125, "epoch": 0.19285353240794742, "grad_norm": 0.0043485829084442896, "learning_rate": 2.726164426383245e-05, "loss": 0.4765, "mean_token_accuracy": 0.8833438903093338, "num_tokens": 149531915.0, "step": 310 }, { "entropy": 0.5172119140625, "epoch": 0.19409774874606323, "grad_norm": 0.004695010047700339, "learning_rate": 2.7242888402625822e-05, "loss": 0.474, "mean_token_accuracy": 0.883532090112567, "num_tokens": 150495539.0, "step": 312 }, { "entropy": 0.525146484375, "epoch": 0.195341965084179, "grad_norm": 0.0045213210875779865, "learning_rate": 2.7224132541419194e-05, "loss": 0.4813, "mean_token_accuracy": 0.8825348038226366, "num_tokens": 151459414.0, "step": 314 }, { "entropy": 0.5130615234375, "epoch": 0.1965861814222948, "grad_norm": 0.004323425876520526, "learning_rate": 2.7205376680212566e-05, "loss": 0.4701, "mean_token_accuracy": 0.8840730246156454, "num_tokens": 152430105.0, "step": 316 }, { "entropy": 0.50567626953125, "epoch": 0.1978303977604106, "grad_norm": 0.00466352184037346, "learning_rate": 2.718662081900594e-05, "loss": 0.4644, "mean_token_accuracy": 0.8858729880303144, "num_tokens": 153399254.0, "step": 318 }, { "entropy": 0.50164794921875, "epoch": 0.1990746140985264, "grad_norm": 0.004288192130061775, "learning_rate": 2.7167864957799313e-05, "loss": 0.4624, "mean_token_accuracy": 0.8859351649880409, "num_tokens": 154371167.0, "step": 320 }, { "entropy": 0.51800537109375, "epoch": 0.20031883043664217, "grad_norm": 0.004152993528814156, "learning_rate": 2.7149109096592685e-05, "loss": 0.4752, "mean_token_accuracy": 0.8830097503960133, "num_tokens": 155350218.0, "step": 322 }, { "entropy": 0.51104736328125, "epoch": 0.20156304677475795, "grad_norm": 0.0046113646530450475, "learning_rate": 2.713035323538606e-05, "loss": 0.4694, "mean_token_accuracy": 0.8843841180205345, "num_tokens": 156314175.0, "step": 324 }, { "entropy": 0.50146484375, "epoch": 0.20280726311287375, "grad_norm": 0.004269408571451082, "learning_rate": 2.7111597374179432e-05, "loss": 0.46, "mean_token_accuracy": 0.8860575053840876, "num_tokens": 157261751.0, "step": 326 }, { "entropy": 0.50396728515625, "epoch": 0.20405147945098953, "grad_norm": 0.0051287328174686305, "learning_rate": 2.7092841512972807e-05, "loss": 0.4659, "mean_token_accuracy": 0.8850192446261644, "num_tokens": 158229795.0, "step": 328 }, { "entropy": 0.49884033203125, "epoch": 0.20529569578910534, "grad_norm": 0.00445349920685799, "learning_rate": 2.7074085651766175e-05, "loss": 0.4629, "mean_token_accuracy": 0.8855830244719982, "num_tokens": 159188728.0, "step": 330 }, { "entropy": 0.505859375, "epoch": 0.20653991212722111, "grad_norm": 0.004537975678796345, "learning_rate": 2.705532979055955e-05, "loss": 0.4692, "mean_token_accuracy": 0.8839662168174982, "num_tokens": 160153310.0, "step": 332 }, { "entropy": 0.5103759765625, "epoch": 0.20778412846533692, "grad_norm": 0.004987569924797283, "learning_rate": 2.7036573929352922e-05, "loss": 0.4684, "mean_token_accuracy": 0.8842402920126915, "num_tokens": 161104577.0, "step": 334 }, { "entropy": 0.50714111328125, "epoch": 0.2090283448034527, "grad_norm": 0.004851728673771707, "learning_rate": 2.7017818068146298e-05, "loss": 0.4669, "mean_token_accuracy": 0.8851086385548115, "num_tokens": 162075663.0, "step": 336 }, { "entropy": 0.5118408203125, "epoch": 0.2102725611415685, "grad_norm": 0.004942360273733116, "learning_rate": 2.699906220693967e-05, "loss": 0.4714, "mean_token_accuracy": 0.8842042814940214, "num_tokens": 163051455.0, "step": 338 }, { "entropy": 0.50506591796875, "epoch": 0.21151677747968428, "grad_norm": 0.005287784067246522, "learning_rate": 2.698030634573304e-05, "loss": 0.4648, "mean_token_accuracy": 0.8851244226098061, "num_tokens": 164016264.0, "step": 340 }, { "entropy": 0.5029296875, "epoch": 0.21276099381780006, "grad_norm": 0.00558567650622737, "learning_rate": 2.6961550484526416e-05, "loss": 0.4676, "mean_token_accuracy": 0.8846770003437996, "num_tokens": 164979276.0, "step": 342 }, { "entropy": 0.50885009765625, "epoch": 0.21400521015591586, "grad_norm": 0.004543453194693435, "learning_rate": 2.6942794623319788e-05, "loss": 0.4676, "mean_token_accuracy": 0.8840575665235519, "num_tokens": 165961176.0, "step": 344 }, { "entropy": 0.50128173828125, "epoch": 0.21524942649403164, "grad_norm": 0.0048737992863131605, "learning_rate": 2.6924038762113163e-05, "loss": 0.4562, "mean_token_accuracy": 0.8864925540983677, "num_tokens": 166917729.0, "step": 346 }, { "entropy": 0.5018310546875, "epoch": 0.21649364283214745, "grad_norm": 0.004969150127860555, "learning_rate": 2.6905282900906532e-05, "loss": 0.4648, "mean_token_accuracy": 0.8847461380064487, "num_tokens": 167878867.0, "step": 348 }, { "entropy": 0.49920654296875, "epoch": 0.21773785917026323, "grad_norm": 0.0045757386232092075, "learning_rate": 2.6886527039699907e-05, "loss": 0.4584, "mean_token_accuracy": 0.8854996263980865, "num_tokens": 168849189.0, "step": 350 }, { "entropy": 0.49591064453125, "epoch": 0.21898207550837903, "grad_norm": 0.004520784364191752, "learning_rate": 2.686777117849328e-05, "loss": 0.4603, "mean_token_accuracy": 0.8854717332869768, "num_tokens": 169816582.0, "step": 352 }, { "entropy": 0.5050048828125, "epoch": 0.2202262918464948, "grad_norm": 0.005072150995066861, "learning_rate": 2.6849015317286654e-05, "loss": 0.466, "mean_token_accuracy": 0.8841619547456503, "num_tokens": 170775008.0, "step": 354 }, { "entropy": 0.4949951171875, "epoch": 0.2214705081846106, "grad_norm": 0.005594963684371403, "learning_rate": 2.6830259456080022e-05, "loss": 0.4585, "mean_token_accuracy": 0.8863363694399595, "num_tokens": 171741147.0, "step": 356 }, { "entropy": 0.4974365234375, "epoch": 0.2227147245227264, "grad_norm": 0.005182738091358197, "learning_rate": 2.6811503594873398e-05, "loss": 0.4566, "mean_token_accuracy": 0.8867263589054346, "num_tokens": 172693427.0, "step": 358 }, { "entropy": 0.496337890625, "epoch": 0.22395894086084217, "grad_norm": 0.005262668867115653, "learning_rate": 2.6792747733666773e-05, "loss": 0.4557, "mean_token_accuracy": 0.8863802403211594, "num_tokens": 173637477.0, "step": 360 }, { "entropy": 0.5008544921875, "epoch": 0.22520315719895798, "grad_norm": 0.005178073799981167, "learning_rate": 2.6773991872460145e-05, "loss": 0.4584, "mean_token_accuracy": 0.8855376802384853, "num_tokens": 174605713.0, "step": 362 }, { "entropy": 0.4913330078125, "epoch": 0.22644737353707375, "grad_norm": 0.005907129506877732, "learning_rate": 2.675523601125352e-05, "loss": 0.454, "mean_token_accuracy": 0.8861636780202389, "num_tokens": 175581897.0, "step": 364 }, { "entropy": 0.49261474609375, "epoch": 0.22769158987518956, "grad_norm": 0.005451056484619964, "learning_rate": 2.6736480150046888e-05, "loss": 0.4536, "mean_token_accuracy": 0.8860989715903997, "num_tokens": 176562251.0, "step": 366 }, { "entropy": 0.49249267578125, "epoch": 0.22893580621330534, "grad_norm": 0.006059216156246989, "learning_rate": 2.6717724288840263e-05, "loss": 0.4502, "mean_token_accuracy": 0.8866901956498623, "num_tokens": 177524852.0, "step": 368 }, { "entropy": 0.49267578125, "epoch": 0.23018002255142112, "grad_norm": 0.006051217114050417, "learning_rate": 2.6698968427633635e-05, "loss": 0.4468, "mean_token_accuracy": 0.8874351494014263, "num_tokens": 178499836.0, "step": 370 }, { "entropy": 0.489990234375, "epoch": 0.23142423888953692, "grad_norm": 0.006466399213379338, "learning_rate": 2.668021256642701e-05, "loss": 0.4501, "mean_token_accuracy": 0.8868417982012033, "num_tokens": 179466026.0, "step": 372 }, { "entropy": 0.49090576171875, "epoch": 0.2326684552276527, "grad_norm": 0.006788336517849395, "learning_rate": 2.666145670522038e-05, "loss": 0.4542, "mean_token_accuracy": 0.886442145332694, "num_tokens": 180436577.0, "step": 374 }, { "entropy": 0.4935302734375, "epoch": 0.2339126715657685, "grad_norm": 0.007041085950798347, "learning_rate": 2.6642700844013754e-05, "loss": 0.4504, "mean_token_accuracy": 0.8865497447550297, "num_tokens": 181403075.0, "step": 376 }, { "entropy": 0.49542236328125, "epoch": 0.23515688790388428, "grad_norm": 0.008111778854524583, "learning_rate": 2.662394498280713e-05, "loss": 0.4556, "mean_token_accuracy": 0.8853060007095337, "num_tokens": 182375569.0, "step": 378 }, { "entropy": 0.4893798828125, "epoch": 0.2364011042420001, "grad_norm": 0.01032864583200041, "learning_rate": 2.66051891216005e-05, "loss": 0.4454, "mean_token_accuracy": 0.8872009590268135, "num_tokens": 183340051.0, "step": 380 }, { "entropy": 0.48736572265625, "epoch": 0.23764532058011587, "grad_norm": 0.013244334422330322, "learning_rate": 2.6586433260393876e-05, "loss": 0.4432, "mean_token_accuracy": 0.8867799993604422, "num_tokens": 184304177.0, "step": 382 }, { "entropy": 0.48675537109375, "epoch": 0.23888953691823167, "grad_norm": 0.013648450112316027, "learning_rate": 2.6567677399187245e-05, "loss": 0.4428, "mean_token_accuracy": 0.8865546360611916, "num_tokens": 185251151.0, "step": 384 }, { "entropy": 0.493408203125, "epoch": 0.24013375325634745, "grad_norm": 0.01788336508658616, "learning_rate": 2.654892153798062e-05, "loss": 0.4418, "mean_token_accuracy": 0.8885820787400007, "num_tokens": 186212613.0, "step": 386 }, { "entropy": 0.49517822265625, "epoch": 0.24137796959446323, "grad_norm": 0.02686746126856259, "learning_rate": 2.6530165676773992e-05, "loss": 0.445, "mean_token_accuracy": 0.8876909986138344, "num_tokens": 187181781.0, "step": 388 }, { "entropy": 0.477294921875, "epoch": 0.24262218593257903, "grad_norm": 0.009239502027767377, "learning_rate": 2.6511409815567367e-05, "loss": 0.4353, "mean_token_accuracy": 0.8891771286725998, "num_tokens": 188138987.0, "step": 390 }, { "entropy": 0.48614501953125, "epoch": 0.2438664022706948, "grad_norm": 0.009675534041973853, "learning_rate": 2.649265395436074e-05, "loss": 0.4464, "mean_token_accuracy": 0.8870788104832172, "num_tokens": 189113622.0, "step": 392 }, { "entropy": 0.48712158203125, "epoch": 0.24511061860881062, "grad_norm": 0.012764519522815536, "learning_rate": 2.647389809315411e-05, "loss": 0.4336, "mean_token_accuracy": 0.888870244845748, "num_tokens": 190067601.0, "step": 394 }, { "entropy": 0.48846435546875, "epoch": 0.2463548349469264, "grad_norm": 0.008909102618395691, "learning_rate": 2.6455142231947486e-05, "loss": 0.4371, "mean_token_accuracy": 0.8889909945428371, "num_tokens": 191051966.0, "step": 396 }, { "entropy": 0.47308349609375, "epoch": 0.2475990512850422, "grad_norm": 0.00567123613584487, "learning_rate": 2.6436386370740858e-05, "loss": 0.434, "mean_token_accuracy": 0.888976726680994, "num_tokens": 191997432.0, "step": 398 }, { "entropy": 0.46685791015625, "epoch": 0.24884326762315798, "grad_norm": 0.007697971460837049, "learning_rate": 2.641763050953423e-05, "loss": 0.4338, "mean_token_accuracy": 0.8891915380954742, "num_tokens": 192958737.0, "step": 400 }, { "epoch": 0.24884326762315798, "eval_entropy": 0.4720287689254062, "eval_loss": 0.43580877780914307, "eval_mean_token_accuracy": 0.8887623557752133, "eval_num_tokens": 192958737.0, "eval_runtime": 425.6289, "eval_samples_per_second": 203.534, "eval_steps_per_second": 3.181, "step": 400 }, { "entropy": 0.4796142578125, "epoch": 0.2500874839612738, "grad_norm": 0.006897028838757869, "learning_rate": 2.63988746483276e-05, "loss": 0.44, "mean_token_accuracy": 0.8880260232836008, "num_tokens": 193934720.0, "step": 402 }, { "entropy": 0.4840087890625, "epoch": 0.25133170029938956, "grad_norm": 0.0057817394357148905, "learning_rate": 2.6380118787120976e-05, "loss": 0.4411, "mean_token_accuracy": 0.8872070126235485, "num_tokens": 194907365.0, "step": 404 }, { "entropy": 0.46368408203125, "epoch": 0.25257591663750534, "grad_norm": 0.004739683940332359, "learning_rate": 2.6361362925914348e-05, "loss": 0.4254, "mean_token_accuracy": 0.890717014670372, "num_tokens": 195858776.0, "step": 406 }, { "entropy": 0.466552734375, "epoch": 0.2538201329756211, "grad_norm": 0.005218983574168385, "learning_rate": 2.6342607064707723e-05, "loss": 0.4331, "mean_token_accuracy": 0.8897191379219294, "num_tokens": 196814547.0, "step": 408 }, { "entropy": 0.47247314453125, "epoch": 0.25506434931373695, "grad_norm": 0.004494164653523746, "learning_rate": 2.6323851203501095e-05, "loss": 0.4359, "mean_token_accuracy": 0.8881633393466473, "num_tokens": 197777990.0, "step": 410 }, { "entropy": 0.46734619140625, "epoch": 0.2563085656518527, "grad_norm": 0.005021098754889972, "learning_rate": 2.6305095342294467e-05, "loss": 0.4298, "mean_token_accuracy": 0.8900037594139576, "num_tokens": 198733243.0, "step": 412 }, { "entropy": 0.47259521484375, "epoch": 0.2575527819899685, "grad_norm": 0.005396352610617961, "learning_rate": 2.6286339481087842e-05, "loss": 0.431, "mean_token_accuracy": 0.889799203723669, "num_tokens": 199703301.0, "step": 414 }, { "entropy": 0.47418212890625, "epoch": 0.2587969983280843, "grad_norm": 0.004514583683483479, "learning_rate": 2.6267583619881214e-05, "loss": 0.4346, "mean_token_accuracy": 0.8891310822218657, "num_tokens": 200662536.0, "step": 416 }, { "entropy": 0.45526123046875, "epoch": 0.26004121466620006, "grad_norm": 0.004513856518161222, "learning_rate": 2.6248827758674586e-05, "loss": 0.4193, "mean_token_accuracy": 0.8919303603470325, "num_tokens": 201610408.0, "step": 418 }, { "entropy": 0.462890625, "epoch": 0.2612854310043159, "grad_norm": 0.005561316107281183, "learning_rate": 2.6230071897467958e-05, "loss": 0.4248, "mean_token_accuracy": 0.8909801263362169, "num_tokens": 202574407.0, "step": 420 }, { "entropy": 0.471923828125, "epoch": 0.26252964734243167, "grad_norm": 0.004874097663680325, "learning_rate": 2.6211316036261333e-05, "loss": 0.4337, "mean_token_accuracy": 0.8890033438801765, "num_tokens": 203533882.0, "step": 422 }, { "entropy": 0.46832275390625, "epoch": 0.26377386368054745, "grad_norm": 0.004665521044375344, "learning_rate": 2.6192560175054705e-05, "loss": 0.4332, "mean_token_accuracy": 0.8889031261205673, "num_tokens": 204511039.0, "step": 424 }, { "entropy": 0.46832275390625, "epoch": 0.2650180800186632, "grad_norm": 0.004496898871788555, "learning_rate": 2.617380431384808e-05, "loss": 0.4339, "mean_token_accuracy": 0.8885703664273024, "num_tokens": 205486536.0, "step": 426 }, { "entropy": 0.47052001953125, "epoch": 0.26626229635677906, "grad_norm": 0.004518568298646117, "learning_rate": 2.615504845264145e-05, "loss": 0.4306, "mean_token_accuracy": 0.8897290006279945, "num_tokens": 206461202.0, "step": 428 }, { "entropy": 0.46466064453125, "epoch": 0.26750651269489484, "grad_norm": 0.004575599804043483, "learning_rate": 2.6136292591434823e-05, "loss": 0.4265, "mean_token_accuracy": 0.8902626466006041, "num_tokens": 207431201.0, "step": 430 }, { "entropy": 0.4727783203125, "epoch": 0.2687507290330106, "grad_norm": 0.004811657580266161, "learning_rate": 2.61175367302282e-05, "loss": 0.4392, "mean_token_accuracy": 0.8881986327469349, "num_tokens": 208391480.0, "step": 432 }, { "entropy": 0.464111328125, "epoch": 0.2699949453711264, "grad_norm": 0.004439664447917304, "learning_rate": 2.609878086902157e-05, "loss": 0.428, "mean_token_accuracy": 0.8901105523109436, "num_tokens": 209359643.0, "step": 434 }, { "entropy": 0.47125244140625, "epoch": 0.27123916170924217, "grad_norm": 0.004462800489477215, "learning_rate": 2.6080025007814942e-05, "loss": 0.4315, "mean_token_accuracy": 0.8892256580293179, "num_tokens": 210321556.0, "step": 436 }, { "entropy": 0.46185302734375, "epoch": 0.272483378047358, "grad_norm": 0.0048451457742654785, "learning_rate": 2.6061269146608314e-05, "loss": 0.4279, "mean_token_accuracy": 0.8905858621001244, "num_tokens": 211298602.0, "step": 438 }, { "entropy": 0.459228515625, "epoch": 0.2737275943854738, "grad_norm": 0.00428487185632129, "learning_rate": 2.604251328540169e-05, "loss": 0.4246, "mean_token_accuracy": 0.8907031863927841, "num_tokens": 212263843.0, "step": 440 }, { "entropy": 0.47454833984375, "epoch": 0.27497181072358956, "grad_norm": 0.004399314198389893, "learning_rate": 2.602375742419506e-05, "loss": 0.4365, "mean_token_accuracy": 0.8880468718707561, "num_tokens": 213236820.0, "step": 442 }, { "entropy": 0.46240234375, "epoch": 0.27621602706170534, "grad_norm": 0.004605209226289197, "learning_rate": 2.6005001562988433e-05, "loss": 0.4254, "mean_token_accuracy": 0.8907176367938519, "num_tokens": 214200379.0, "step": 444 }, { "entropy": 0.460205078125, "epoch": 0.27746024339982117, "grad_norm": 0.004941616611446251, "learning_rate": 2.5986245701781808e-05, "loss": 0.4268, "mean_token_accuracy": 0.890496289357543, "num_tokens": 215172752.0, "step": 446 }, { "entropy": 0.46612548828125, "epoch": 0.27870445973793695, "grad_norm": 0.004930770933576395, "learning_rate": 2.596748984057518e-05, "loss": 0.4301, "mean_token_accuracy": 0.8898529168218374, "num_tokens": 216141318.0, "step": 448 }, { "entropy": 0.45709228515625, "epoch": 0.2799486760760527, "grad_norm": 0.00483268654552316, "learning_rate": 2.5948733979368555e-05, "loss": 0.4277, "mean_token_accuracy": 0.8904906753450632, "num_tokens": 217118321.0, "step": 450 }, { "entropy": 0.46856689453125, "epoch": 0.2811928924141685, "grad_norm": 0.004871722908737315, "learning_rate": 2.5929978118161927e-05, "loss": 0.4347, "mean_token_accuracy": 0.888842137530446, "num_tokens": 218084019.0, "step": 452 }, { "entropy": 0.470703125, "epoch": 0.2824371087522843, "grad_norm": 0.00574500400730459, "learning_rate": 2.59112222569553e-05, "loss": 0.429, "mean_token_accuracy": 0.8903706949204206, "num_tokens": 219049973.0, "step": 454 }, { "entropy": 0.468994140625, "epoch": 0.2836813250904001, "grad_norm": 0.005475406900381574, "learning_rate": 2.589246639574867e-05, "loss": 0.433, "mean_token_accuracy": 0.8888360410928726, "num_tokens": 220007772.0, "step": 456 }, { "entropy": 0.45599365234375, "epoch": 0.2849255414285159, "grad_norm": 0.006500639488628257, "learning_rate": 2.5873710534542046e-05, "loss": 0.4273, "mean_token_accuracy": 0.8904174882918596, "num_tokens": 220976347.0, "step": 458 }, { "entropy": 0.4593505859375, "epoch": 0.28616975776663167, "grad_norm": 0.004850228675234435, "learning_rate": 2.585495467333542e-05, "loss": 0.4252, "mean_token_accuracy": 0.890261696651578, "num_tokens": 221925230.0, "step": 460 }, { "entropy": 0.46051025390625, "epoch": 0.28741397410474745, "grad_norm": 0.005096194001429068, "learning_rate": 2.583619881212879e-05, "loss": 0.4236, "mean_token_accuracy": 0.8910165466368198, "num_tokens": 222884121.0, "step": 462 }, { "entropy": 0.45745849609375, "epoch": 0.2886581904428632, "grad_norm": 0.005934226162691709, "learning_rate": 2.5817442950922164e-05, "loss": 0.4222, "mean_token_accuracy": 0.8916088528931141, "num_tokens": 223845208.0, "step": 464 }, { "entropy": 0.46331787109375, "epoch": 0.28990240678097906, "grad_norm": 0.005368519336846048, "learning_rate": 2.5798687089715536e-05, "loss": 0.4301, "mean_token_accuracy": 0.8894176427274942, "num_tokens": 224811948.0, "step": 466 }, { "entropy": 0.45672607421875, "epoch": 0.29114662311909484, "grad_norm": 0.005128102241738148, "learning_rate": 2.577993122850891e-05, "loss": 0.4246, "mean_token_accuracy": 0.8909121509641409, "num_tokens": 225779336.0, "step": 468 }, { "entropy": 0.4669189453125, "epoch": 0.2923908394572106, "grad_norm": 0.004463987437196139, "learning_rate": 2.576117536730228e-05, "loss": 0.4306, "mean_token_accuracy": 0.889289166778326, "num_tokens": 226739848.0, "step": 470 }, { "entropy": 0.453857421875, "epoch": 0.2936350557953264, "grad_norm": 0.00516933905355652, "learning_rate": 2.5742419506095655e-05, "loss": 0.4188, "mean_token_accuracy": 0.8919338099658489, "num_tokens": 227705921.0, "step": 472 }, { "entropy": 0.45989990234375, "epoch": 0.2948792721334422, "grad_norm": 0.004582521234086089, "learning_rate": 2.5723663644889027e-05, "loss": 0.4289, "mean_token_accuracy": 0.8903973195701838, "num_tokens": 228691371.0, "step": 474 }, { "entropy": 0.46075439453125, "epoch": 0.296123488471558, "grad_norm": 0.004682070684254571, "learning_rate": 2.5704907783682402e-05, "loss": 0.4247, "mean_token_accuracy": 0.890631852671504, "num_tokens": 229666909.0, "step": 476 }, { "entropy": 0.45880126953125, "epoch": 0.2973677048096738, "grad_norm": 0.004339210841702677, "learning_rate": 2.5686151922475777e-05, "loss": 0.4244, "mean_token_accuracy": 0.8906896058470011, "num_tokens": 230643064.0, "step": 478 }, { "entropy": 0.4603271484375, "epoch": 0.29861192114778956, "grad_norm": 0.004283397808777102, "learning_rate": 2.5667396061269146e-05, "loss": 0.4239, "mean_token_accuracy": 0.890802588313818, "num_tokens": 231606376.0, "step": 480 }, { "entropy": 0.458740234375, "epoch": 0.29985613748590534, "grad_norm": 0.005901089093638439, "learning_rate": 2.564864020006252e-05, "loss": 0.4241, "mean_token_accuracy": 0.8907635007053614, "num_tokens": 232573256.0, "step": 482 }, { "entropy": 0.4639892578125, "epoch": 0.30110035382402117, "grad_norm": 0.006868626379641323, "learning_rate": 2.5629884338855893e-05, "loss": 0.4328, "mean_token_accuracy": 0.8889743648469448, "num_tokens": 233536757.0, "step": 484 }, { "entropy": 0.46112060546875, "epoch": 0.30234457016213695, "grad_norm": 0.006130771146817717, "learning_rate": 2.5611128477649268e-05, "loss": 0.4245, "mean_token_accuracy": 0.8908517342060804, "num_tokens": 234499288.0, "step": 486 }, { "entropy": 0.465087890625, "epoch": 0.3035887865002527, "grad_norm": 0.005612334682109576, "learning_rate": 2.5592372616442636e-05, "loss": 0.4282, "mean_token_accuracy": 0.8895002659410238, "num_tokens": 235476706.0, "step": 488 }, { "entropy": 0.4515380859375, "epoch": 0.3048330028383685, "grad_norm": 0.006497627206412982, "learning_rate": 2.557361675523601e-05, "loss": 0.4218, "mean_token_accuracy": 0.8913768474012613, "num_tokens": 236428820.0, "step": 490 }, { "entropy": 0.45098876953125, "epoch": 0.30607721917648434, "grad_norm": 0.004292781383146647, "learning_rate": 2.5554860894029383e-05, "loss": 0.422, "mean_token_accuracy": 0.8912322521209717, "num_tokens": 237401348.0, "step": 492 }, { "entropy": 0.45428466796875, "epoch": 0.3073214355146001, "grad_norm": 0.006284288317873616, "learning_rate": 2.553610503282276e-05, "loss": 0.4165, "mean_token_accuracy": 0.8921407107263803, "num_tokens": 238380238.0, "step": 494 }, { "entropy": 0.4573974609375, "epoch": 0.3085656518527159, "grad_norm": 0.004883860055587397, "learning_rate": 2.5517349171616134e-05, "loss": 0.4251, "mean_token_accuracy": 0.890363235026598, "num_tokens": 239345817.0, "step": 496 }, { "entropy": 0.45074462890625, "epoch": 0.30980986819083167, "grad_norm": 0.005177136040094253, "learning_rate": 2.5498593310409502e-05, "loss": 0.4221, "mean_token_accuracy": 0.8911670856177807, "num_tokens": 240300723.0, "step": 498 }, { "entropy": 0.45941162109375, "epoch": 0.31105408452894745, "grad_norm": 0.005931150384184672, "learning_rate": 2.5479837449202877e-05, "loss": 0.4214, "mean_token_accuracy": 0.8909163642674685, "num_tokens": 241255776.0, "step": 500 }, { "entropy": 0.4527587890625, "epoch": 0.3122983008670633, "grad_norm": 0.005465767911805945, "learning_rate": 2.546108158799625e-05, "loss": 0.4207, "mean_token_accuracy": 0.8913468904793262, "num_tokens": 242215406.0, "step": 502 }, { "entropy": 0.45208740234375, "epoch": 0.31354251720517906, "grad_norm": 0.004217191658836834, "learning_rate": 2.5442325726789624e-05, "loss": 0.4209, "mean_token_accuracy": 0.8911511097103357, "num_tokens": 243166922.0, "step": 504 }, { "entropy": 0.45599365234375, "epoch": 0.31478673354329484, "grad_norm": 0.005791570845179005, "learning_rate": 2.5423569865582993e-05, "loss": 0.4212, "mean_token_accuracy": 0.8910525739192963, "num_tokens": 244124866.0, "step": 506 }, { "entropy": 0.45294189453125, "epoch": 0.3160309498814106, "grad_norm": 0.004605564952735109, "learning_rate": 2.5404814004376368e-05, "loss": 0.4179, "mean_token_accuracy": 0.892103822901845, "num_tokens": 245091224.0, "step": 508 }, { "entropy": 0.452392578125, "epoch": 0.3172751662195264, "grad_norm": 0.0050936141441662915, "learning_rate": 2.538605814316974e-05, "loss": 0.4241, "mean_token_accuracy": 0.8904783334583044, "num_tokens": 246052624.0, "step": 510 }, { "entropy": 0.458984375, "epoch": 0.3185193825576422, "grad_norm": 0.00516200173479674, "learning_rate": 2.5367302281963115e-05, "loss": 0.4251, "mean_token_accuracy": 0.8901417311280966, "num_tokens": 247015400.0, "step": 512 }, { "entropy": 0.46148681640625, "epoch": 0.319763598895758, "grad_norm": 0.005492270043605151, "learning_rate": 2.5348546420756487e-05, "loss": 0.4244, "mean_token_accuracy": 0.8900549355894327, "num_tokens": 247987638.0, "step": 514 }, { "entropy": 0.444091796875, "epoch": 0.3210078152338738, "grad_norm": 0.0056916206832826844, "learning_rate": 2.532979055954986e-05, "loss": 0.4117, "mean_token_accuracy": 0.8927079197019339, "num_tokens": 248944286.0, "step": 516 }, { "entropy": 0.44708251953125, "epoch": 0.32225203157198956, "grad_norm": 0.004974091021039047, "learning_rate": 2.5311034698343234e-05, "loss": 0.4213, "mean_token_accuracy": 0.8911348562687635, "num_tokens": 249900792.0, "step": 518 }, { "entropy": 0.4476318359375, "epoch": 0.3234962479101054, "grad_norm": 0.005950328324362021, "learning_rate": 2.5292278837136606e-05, "loss": 0.4156, "mean_token_accuracy": 0.8923181276768446, "num_tokens": 250855130.0, "step": 520 }, { "entropy": 0.45611572265625, "epoch": 0.3247404642482212, "grad_norm": 0.005108538539545101, "learning_rate": 2.527352297592998e-05, "loss": 0.4207, "mean_token_accuracy": 0.8908461909741163, "num_tokens": 251832888.0, "step": 522 }, { "entropy": 0.4443359375, "epoch": 0.32598468058633695, "grad_norm": 0.005812179545524564, "learning_rate": 2.525476711472335e-05, "loss": 0.4157, "mean_token_accuracy": 0.892452746629715, "num_tokens": 252796638.0, "step": 524 }, { "entropy": 0.4493408203125, "epoch": 0.32722889692445273, "grad_norm": 0.006620426486238952, "learning_rate": 2.5236011253516724e-05, "loss": 0.4198, "mean_token_accuracy": 0.8916963096708059, "num_tokens": 253763706.0, "step": 526 }, { "entropy": 0.46075439453125, "epoch": 0.3284731132625685, "grad_norm": 0.006489658176494185, "learning_rate": 2.52172553923101e-05, "loss": 0.4257, "mean_token_accuracy": 0.8903600815683603, "num_tokens": 254727177.0, "step": 528 }, { "entropy": 0.45751953125, "epoch": 0.32971732960068434, "grad_norm": 0.005699417972821578, "learning_rate": 2.519849953110347e-05, "loss": 0.4209, "mean_token_accuracy": 0.8912414405494928, "num_tokens": 255699588.0, "step": 530 }, { "entropy": 0.44866943359375, "epoch": 0.3309615459388001, "grad_norm": 0.006670073005625764, "learning_rate": 2.5179743669896843e-05, "loss": 0.4217, "mean_token_accuracy": 0.8910941835492849, "num_tokens": 256655197.0, "step": 532 }, { "entropy": 0.45294189453125, "epoch": 0.3322057622769159, "grad_norm": 0.006200220647366234, "learning_rate": 2.5160987808690215e-05, "loss": 0.417, "mean_token_accuracy": 0.892251830548048, "num_tokens": 257610109.0, "step": 534 }, { "entropy": 0.45355224609375, "epoch": 0.3334499786150317, "grad_norm": 0.006548133340795684, "learning_rate": 2.514223194748359e-05, "loss": 0.4224, "mean_token_accuracy": 0.8909701388329268, "num_tokens": 258576915.0, "step": 536 }, { "entropy": 0.44744873046875, "epoch": 0.3346941949531475, "grad_norm": 0.0057484443637258525, "learning_rate": 2.5123476086276962e-05, "loss": 0.4158, "mean_token_accuracy": 0.8919319752603769, "num_tokens": 259536807.0, "step": 538 }, { "entropy": 0.44818115234375, "epoch": 0.3359384112912633, "grad_norm": 0.005553567661281277, "learning_rate": 2.5104720225070337e-05, "loss": 0.4177, "mean_token_accuracy": 0.8914633300155401, "num_tokens": 260507252.0, "step": 540 }, { "entropy": 0.4468994140625, "epoch": 0.33718262762937906, "grad_norm": 0.00523702745421564, "learning_rate": 2.5085964363863706e-05, "loss": 0.4154, "mean_token_accuracy": 0.892024802044034, "num_tokens": 261481516.0, "step": 542 }, { "entropy": 0.4580078125, "epoch": 0.33842684396749484, "grad_norm": 0.005373074905691709, "learning_rate": 2.506720850265708e-05, "loss": 0.4273, "mean_token_accuracy": 0.8901299517601728, "num_tokens": 262447540.0, "step": 544 }, { "entropy": 0.44781494140625, "epoch": 0.3396710603056106, "grad_norm": 0.006153607876578623, "learning_rate": 2.5048452641450456e-05, "loss": 0.415, "mean_token_accuracy": 0.8923297133296728, "num_tokens": 263413675.0, "step": 546 }, { "entropy": 0.4508056640625, "epoch": 0.34091527664372645, "grad_norm": 0.0068131801229844345, "learning_rate": 2.5029696780243828e-05, "loss": 0.4178, "mean_token_accuracy": 0.8917399439960718, "num_tokens": 264385695.0, "step": 548 }, { "entropy": 0.45306396484375, "epoch": 0.34215949298184223, "grad_norm": 0.006557276801475367, "learning_rate": 2.50109409190372e-05, "loss": 0.4228, "mean_token_accuracy": 0.8911078590899706, "num_tokens": 265358126.0, "step": 550 }, { "entropy": 0.4498291015625, "epoch": 0.343403709319958, "grad_norm": 0.006363721454860279, "learning_rate": 2.499218505783057e-05, "loss": 0.4173, "mean_token_accuracy": 0.8916207514703274, "num_tokens": 266344252.0, "step": 552 }, { "entropy": 0.44970703125, "epoch": 0.3446479256580738, "grad_norm": 0.004522733428341086, "learning_rate": 2.4973429196623947e-05, "loss": 0.418, "mean_token_accuracy": 0.8916903976351023, "num_tokens": 267318467.0, "step": 554 }, { "entropy": 0.44439697265625, "epoch": 0.34589214199618956, "grad_norm": 0.005950289004872655, "learning_rate": 2.495467333541732e-05, "loss": 0.4149, "mean_token_accuracy": 0.8924087416380644, "num_tokens": 268282998.0, "step": 556 }, { "entropy": 0.449462890625, "epoch": 0.3471363583343054, "grad_norm": 0.005744945354120763, "learning_rate": 2.493591747421069e-05, "loss": 0.4216, "mean_token_accuracy": 0.8911764528602362, "num_tokens": 269245399.0, "step": 558 }, { "entropy": 0.44647216796875, "epoch": 0.3483805746724212, "grad_norm": 0.005769935885274805, "learning_rate": 2.4917161613004062e-05, "loss": 0.4162, "mean_token_accuracy": 0.8925590496510267, "num_tokens": 270212411.0, "step": 560 }, { "entropy": 0.44635009765625, "epoch": 0.34962479101053695, "grad_norm": 0.006435919702031474, "learning_rate": 2.4898405751797437e-05, "loss": 0.4131, "mean_token_accuracy": 0.8932364322245121, "num_tokens": 271164338.0, "step": 562 }, { "entropy": 0.4493408203125, "epoch": 0.35086900734865273, "grad_norm": 0.005759117933727746, "learning_rate": 2.4879649890590812e-05, "loss": 0.4169, "mean_token_accuracy": 0.8917686976492405, "num_tokens": 272134547.0, "step": 564 }, { "entropy": 0.44696044921875, "epoch": 0.35211322368676856, "grad_norm": 0.004704156619157619, "learning_rate": 2.4860894029384184e-05, "loss": 0.416, "mean_token_accuracy": 0.8921545967459679, "num_tokens": 273104678.0, "step": 566 }, { "entropy": 0.44451904296875, "epoch": 0.35335744002488434, "grad_norm": 0.005407003237696463, "learning_rate": 2.4842138168177556e-05, "loss": 0.4154, "mean_token_accuracy": 0.8922662418335676, "num_tokens": 274062402.0, "step": 568 }, { "entropy": 0.4521484375, "epoch": 0.3546016563630001, "grad_norm": 0.006272963348070624, "learning_rate": 2.4823382306970928e-05, "loss": 0.418, "mean_token_accuracy": 0.8916661534458399, "num_tokens": 275030472.0, "step": 570 }, { "entropy": 0.43792724609375, "epoch": 0.3558458727011159, "grad_norm": 0.005777897875567104, "learning_rate": 2.4804626445764303e-05, "loss": 0.41, "mean_token_accuracy": 0.8930404800921679, "num_tokens": 275979821.0, "step": 572 }, { "entropy": 0.44659423828125, "epoch": 0.3570900890392317, "grad_norm": 0.005014889518788805, "learning_rate": 2.4785870584557675e-05, "loss": 0.4195, "mean_token_accuracy": 0.8917618580162525, "num_tokens": 276963945.0, "step": 574 }, { "entropy": 0.4459228515625, "epoch": 0.3583343053773475, "grad_norm": 0.005501167050581574, "learning_rate": 2.4767114723351047e-05, "loss": 0.41, "mean_token_accuracy": 0.8932163268327713, "num_tokens": 277934252.0, "step": 576 }, { "entropy": 0.450927734375, "epoch": 0.3595785217154633, "grad_norm": 0.0053459789989875085, "learning_rate": 2.4748358862144422e-05, "loss": 0.4212, "mean_token_accuracy": 0.8909201137721539, "num_tokens": 278901438.0, "step": 578 }, { "entropy": 0.43634033203125, "epoch": 0.36082273805357906, "grad_norm": 0.005602289106774139, "learning_rate": 2.4729603000937794e-05, "loss": 0.4121, "mean_token_accuracy": 0.892591955140233, "num_tokens": 279865946.0, "step": 580 }, { "entropy": 0.4403076171875, "epoch": 0.36206695439169484, "grad_norm": 0.005025557825782334, "learning_rate": 2.471084713973117e-05, "loss": 0.4061, "mean_token_accuracy": 0.8943810816854239, "num_tokens": 280811344.0, "step": 582 }, { "entropy": 0.443603515625, "epoch": 0.3633111707298107, "grad_norm": 0.0049178949394613, "learning_rate": 2.469209127852454e-05, "loss": 0.4101, "mean_token_accuracy": 0.8930227998644114, "num_tokens": 281786348.0, "step": 584 }, { "entropy": 0.44146728515625, "epoch": 0.36455538706792645, "grad_norm": 0.005635964495800884, "learning_rate": 2.4673335417317912e-05, "loss": 0.4111, "mean_token_accuracy": 0.8927997201681137, "num_tokens": 282761639.0, "step": 586 }, { "entropy": 0.43072509765625, "epoch": 0.36579960340604223, "grad_norm": 0.0052053459394645375, "learning_rate": 2.4654579556111284e-05, "loss": 0.4015, "mean_token_accuracy": 0.894774416461587, "num_tokens": 283705561.0, "step": 588 }, { "entropy": 0.45001220703125, "epoch": 0.367043819744158, "grad_norm": 0.005396773541551127, "learning_rate": 2.463582369490466e-05, "loss": 0.4184, "mean_token_accuracy": 0.8920012563467026, "num_tokens": 284681866.0, "step": 590 }, { "entropy": 0.43853759765625, "epoch": 0.3682880360822738, "grad_norm": 0.005624418923580711, "learning_rate": 2.461706783369803e-05, "loss": 0.4082, "mean_token_accuracy": 0.8942910209298134, "num_tokens": 285646468.0, "step": 592 }, { "entropy": 0.43707275390625, "epoch": 0.3695322524203896, "grad_norm": 0.005683897862109425, "learning_rate": 2.4598311972491403e-05, "loss": 0.41, "mean_token_accuracy": 0.8926704041659832, "num_tokens": 286607632.0, "step": 594 }, { "entropy": 0.44305419921875, "epoch": 0.3707764687585054, "grad_norm": 0.005356112329511339, "learning_rate": 2.4579556111284778e-05, "loss": 0.4116, "mean_token_accuracy": 0.8930394221097231, "num_tokens": 287581064.0, "step": 596 }, { "entropy": 0.44842529296875, "epoch": 0.3720206850966212, "grad_norm": 0.0053792455488526, "learning_rate": 2.456080025007815e-05, "loss": 0.4182, "mean_token_accuracy": 0.891961582005024, "num_tokens": 288557694.0, "step": 598 }, { "entropy": 0.4482421875, "epoch": 0.37326490143473695, "grad_norm": 0.004990706953077352, "learning_rate": 2.4542044388871525e-05, "loss": 0.4161, "mean_token_accuracy": 0.8919712118804455, "num_tokens": 289525499.0, "step": 600 }, { "epoch": 0.37326490143473695, "eval_entropy": 0.4405537412296898, "eval_loss": 0.4120519161224365, "eval_mean_token_accuracy": 0.8926645798204216, "eval_num_tokens": 289525499.0, "eval_runtime": 425.6452, "eval_samples_per_second": 203.526, "eval_steps_per_second": 3.181, "step": 600 }, { "entropy": 0.43896484375, "epoch": 0.37450911777285273, "grad_norm": 0.0058490651668419835, "learning_rate": 2.4523288527664894e-05, "loss": 0.4099, "mean_token_accuracy": 0.8930901885032654, "num_tokens": 290504479.0, "step": 602 }, { "entropy": 0.44097900390625, "epoch": 0.37575333411096856, "grad_norm": 0.004998461273268484, "learning_rate": 2.450453266645827e-05, "loss": 0.4117, "mean_token_accuracy": 0.8926319163292646, "num_tokens": 291477249.0, "step": 604 }, { "entropy": 0.44342041015625, "epoch": 0.37699755044908434, "grad_norm": 0.004611969382503389, "learning_rate": 2.448577680525164e-05, "loss": 0.4112, "mean_token_accuracy": 0.8929608706384897, "num_tokens": 292441336.0, "step": 606 }, { "entropy": 0.435302734375, "epoch": 0.3782417667872001, "grad_norm": 0.004794590614084971, "learning_rate": 2.4467020944045016e-05, "loss": 0.4074, "mean_token_accuracy": 0.8939055521041155, "num_tokens": 293421508.0, "step": 608 }, { "entropy": 0.437744140625, "epoch": 0.3794859831253159, "grad_norm": 0.0053690987869954385, "learning_rate": 2.4448265082838388e-05, "loss": 0.4099, "mean_token_accuracy": 0.8931601271033287, "num_tokens": 294373317.0, "step": 610 }, { "entropy": 0.4404296875, "epoch": 0.38073019946343173, "grad_norm": 0.005984729348259475, "learning_rate": 2.442950922163176e-05, "loss": 0.4054, "mean_token_accuracy": 0.8942964505404234, "num_tokens": 295336729.0, "step": 612 }, { "entropy": 0.44140625, "epoch": 0.3819744158015475, "grad_norm": 0.0073068263168549075, "learning_rate": 2.4410753360425135e-05, "loss": 0.4112, "mean_token_accuracy": 0.8931441064924002, "num_tokens": 296305878.0, "step": 614 }, { "entropy": 0.44476318359375, "epoch": 0.3832186321396633, "grad_norm": 0.006311230816655906, "learning_rate": 2.4391997499218506e-05, "loss": 0.4154, "mean_token_accuracy": 0.8916477803140879, "num_tokens": 297263987.0, "step": 616 }, { "entropy": 0.43975830078125, "epoch": 0.38446284847777906, "grad_norm": 0.004651645383306393, "learning_rate": 2.437324163801188e-05, "loss": 0.4093, "mean_token_accuracy": 0.8933092020452023, "num_tokens": 298228205.0, "step": 618 }, { "entropy": 0.4337158203125, "epoch": 0.38570706481589484, "grad_norm": 0.00536508367412953, "learning_rate": 2.435448577680525e-05, "loss": 0.4041, "mean_token_accuracy": 0.8944570738822222, "num_tokens": 299191097.0, "step": 620 }, { "entropy": 0.4283447265625, "epoch": 0.3869512811540107, "grad_norm": 0.0063015530584679316, "learning_rate": 2.4335729915598625e-05, "loss": 0.402, "mean_token_accuracy": 0.8945734184235334, "num_tokens": 300132569.0, "step": 622 }, { "entropy": 0.43408203125, "epoch": 0.38819549749212645, "grad_norm": 0.00576085691868759, "learning_rate": 2.4316974054391997e-05, "loss": 0.4047, "mean_token_accuracy": 0.8939513359218836, "num_tokens": 301085191.0, "step": 624 }, { "entropy": 0.430419921875, "epoch": 0.38943971383024223, "grad_norm": 0.005248211652171765, "learning_rate": 2.4298218193185372e-05, "loss": 0.4024, "mean_token_accuracy": 0.8949748110026121, "num_tokens": 302042264.0, "step": 626 }, { "entropy": 0.44293212890625, "epoch": 0.390683930168358, "grad_norm": 0.007081462168208972, "learning_rate": 2.427946233197874e-05, "loss": 0.4103, "mean_token_accuracy": 0.8928579315543175, "num_tokens": 303011286.0, "step": 628 }, { "entropy": 0.42889404296875, "epoch": 0.39192814650647384, "grad_norm": 0.006296741897190914, "learning_rate": 2.4260706470772116e-05, "loss": 0.4014, "mean_token_accuracy": 0.8950808681547642, "num_tokens": 303966714.0, "step": 630 }, { "entropy": 0.43463134765625, "epoch": 0.3931723628445896, "grad_norm": 0.005626995312167991, "learning_rate": 2.424195060956549e-05, "loss": 0.4068, "mean_token_accuracy": 0.8938818722963333, "num_tokens": 304937290.0, "step": 632 }, { "entropy": 0.43902587890625, "epoch": 0.3944165791827054, "grad_norm": 0.005891874733082082, "learning_rate": 2.4223194748358863e-05, "loss": 0.409, "mean_token_accuracy": 0.8931249491870403, "num_tokens": 305895242.0, "step": 634 }, { "entropy": 0.439208984375, "epoch": 0.3956607955208212, "grad_norm": 0.004836160798453518, "learning_rate": 2.4204438887152238e-05, "loss": 0.4118, "mean_token_accuracy": 0.8934380076825619, "num_tokens": 306858078.0, "step": 636 }, { "entropy": 0.429931640625, "epoch": 0.39690501185893695, "grad_norm": 0.00501238929123632, "learning_rate": 2.4185683025945607e-05, "loss": 0.4035, "mean_token_accuracy": 0.894173750653863, "num_tokens": 307822497.0, "step": 638 }, { "entropy": 0.45245361328125, "epoch": 0.3981492281970528, "grad_norm": 0.005241586121080287, "learning_rate": 2.4166927164738982e-05, "loss": 0.4246, "mean_token_accuracy": 0.8902607318013906, "num_tokens": 308790336.0, "step": 640 }, { "entropy": 0.439697265625, "epoch": 0.39939344453516856, "grad_norm": 0.00638257981675173, "learning_rate": 2.4148171303532354e-05, "loss": 0.4067, "mean_token_accuracy": 0.8936410620808601, "num_tokens": 309743381.0, "step": 642 }, { "entropy": 0.43035888671875, "epoch": 0.40063766087328434, "grad_norm": 0.005590265589036512, "learning_rate": 2.412941544232573e-05, "loss": 0.4053, "mean_token_accuracy": 0.8935759421437979, "num_tokens": 310705280.0, "step": 644 }, { "entropy": 0.4365234375, "epoch": 0.4018818772114001, "grad_norm": 0.0047993046023315404, "learning_rate": 2.41106595811191e-05, "loss": 0.4093, "mean_token_accuracy": 0.8934315554797649, "num_tokens": 311668154.0, "step": 646 }, { "entropy": 0.445068359375, "epoch": 0.4031260935495159, "grad_norm": 0.0050762695852753234, "learning_rate": 2.4091903719912472e-05, "loss": 0.4126, "mean_token_accuracy": 0.8927694912999868, "num_tokens": 312640458.0, "step": 648 }, { "entropy": 0.43707275390625, "epoch": 0.40437030988763173, "grad_norm": 0.005752923878242651, "learning_rate": 2.4073147858705848e-05, "loss": 0.4054, "mean_token_accuracy": 0.8936334028840065, "num_tokens": 313597817.0, "step": 650 }, { "entropy": 0.42681884765625, "epoch": 0.4056145262257475, "grad_norm": 0.007284977784973453, "learning_rate": 2.405439199749922e-05, "loss": 0.4011, "mean_token_accuracy": 0.8951170798391104, "num_tokens": 314571538.0, "step": 652 }, { "entropy": 0.43414306640625, "epoch": 0.4068587425638633, "grad_norm": 0.005437702954066041, "learning_rate": 2.4035636136292595e-05, "loss": 0.4051, "mean_token_accuracy": 0.8935840968042612, "num_tokens": 315544179.0, "step": 654 }, { "entropy": 0.45166015625, "epoch": 0.40810295890197906, "grad_norm": 0.005654911979708561, "learning_rate": 2.4016880275085963e-05, "loss": 0.417, "mean_token_accuracy": 0.8919120728969574, "num_tokens": 316511820.0, "step": 656 }, { "entropy": 0.435546875, "epoch": 0.4093471752400949, "grad_norm": 0.005884474520445735, "learning_rate": 2.3998124413879338e-05, "loss": 0.4086, "mean_token_accuracy": 0.8935281205922365, "num_tokens": 317479722.0, "step": 658 }, { "entropy": 0.4378662109375, "epoch": 0.4105913915782107, "grad_norm": 0.005121505763981191, "learning_rate": 2.397936855267271e-05, "loss": 0.4093, "mean_token_accuracy": 0.8933096490800381, "num_tokens": 318446589.0, "step": 660 }, { "entropy": 0.44036865234375, "epoch": 0.41183560791632645, "grad_norm": 0.005576840240818018, "learning_rate": 2.3960612691466085e-05, "loss": 0.4091, "mean_token_accuracy": 0.8929143939167261, "num_tokens": 319406455.0, "step": 662 }, { "entropy": 0.44281005859375, "epoch": 0.41307982425444223, "grad_norm": 0.005619344459445944, "learning_rate": 2.3941856830259457e-05, "loss": 0.4143, "mean_token_accuracy": 0.8924025259912014, "num_tokens": 320380719.0, "step": 664 }, { "entropy": 0.43707275390625, "epoch": 0.414324040592558, "grad_norm": 0.006403324815015543, "learning_rate": 2.392310096905283e-05, "loss": 0.4092, "mean_token_accuracy": 0.8932097051292658, "num_tokens": 321361592.0, "step": 666 }, { "entropy": 0.43206787109375, "epoch": 0.41556825693067384, "grad_norm": 0.006200706298627916, "learning_rate": 2.3904345107846204e-05, "loss": 0.4073, "mean_token_accuracy": 0.893327634781599, "num_tokens": 322345469.0, "step": 668 }, { "entropy": 0.447021484375, "epoch": 0.4168124732687896, "grad_norm": 0.005715393406326943, "learning_rate": 2.3885589246639576e-05, "loss": 0.4137, "mean_token_accuracy": 0.8922103606164455, "num_tokens": 323315613.0, "step": 670 }, { "entropy": 0.432861328125, "epoch": 0.4180566896069054, "grad_norm": 0.00508667172611282, "learning_rate": 2.3866833385432948e-05, "loss": 0.4018, "mean_token_accuracy": 0.8948980867862701, "num_tokens": 324284700.0, "step": 672 }, { "entropy": 0.4248046875, "epoch": 0.4193009059450212, "grad_norm": 0.006422766602411412, "learning_rate": 2.384807752422632e-05, "loss": 0.4039, "mean_token_accuracy": 0.8944504167884588, "num_tokens": 325246930.0, "step": 674 }, { "entropy": 0.43695068359375, "epoch": 0.420545122283137, "grad_norm": 0.0057700657889702385, "learning_rate": 2.3829321663019695e-05, "loss": 0.4074, "mean_token_accuracy": 0.8937112130224705, "num_tokens": 326211526.0, "step": 676 }, { "entropy": 0.433349609375, "epoch": 0.4217893386212528, "grad_norm": 0.004420688618941281, "learning_rate": 2.3810565801813066e-05, "loss": 0.4057, "mean_token_accuracy": 0.8938558753579855, "num_tokens": 327177214.0, "step": 678 }, { "entropy": 0.43231201171875, "epoch": 0.42303355495936856, "grad_norm": 0.006272018617321604, "learning_rate": 2.379180994060644e-05, "loss": 0.4064, "mean_token_accuracy": 0.8934580031782389, "num_tokens": 328135520.0, "step": 680 }, { "entropy": 0.43572998046875, "epoch": 0.42427777129748434, "grad_norm": 0.004960285852070079, "learning_rate": 2.3773054079399813e-05, "loss": 0.4072, "mean_token_accuracy": 0.8936238437891006, "num_tokens": 329086619.0, "step": 682 }, { "entropy": 0.4381103515625, "epoch": 0.4255219876356001, "grad_norm": 0.005235124245612957, "learning_rate": 2.3754298218193185e-05, "loss": 0.4059, "mean_token_accuracy": 0.8939711581915617, "num_tokens": 330053586.0, "step": 684 }, { "entropy": 0.44134521484375, "epoch": 0.42676620397371595, "grad_norm": 0.005423664468516734, "learning_rate": 2.373554235698656e-05, "loss": 0.411, "mean_token_accuracy": 0.8931374661624432, "num_tokens": 331023843.0, "step": 686 }, { "entropy": 0.43157958984375, "epoch": 0.42801042031183173, "grad_norm": 0.0041753817916951865, "learning_rate": 2.3716786495779932e-05, "loss": 0.4073, "mean_token_accuracy": 0.8933002445846796, "num_tokens": 331991282.0, "step": 688 }, { "entropy": 0.43121337890625, "epoch": 0.4292546366499475, "grad_norm": 0.005135733276935802, "learning_rate": 2.3698030634573304e-05, "loss": 0.4031, "mean_token_accuracy": 0.8948918152600527, "num_tokens": 332964318.0, "step": 690 }, { "entropy": 0.441650390625, "epoch": 0.4304988529880633, "grad_norm": 0.004947681557309468, "learning_rate": 2.3679274773366676e-05, "loss": 0.4143, "mean_token_accuracy": 0.8924829419702291, "num_tokens": 333943743.0, "step": 692 }, { "entropy": 0.43353271484375, "epoch": 0.43174306932617906, "grad_norm": 0.005128613475071985, "learning_rate": 2.366051891216005e-05, "loss": 0.402, "mean_token_accuracy": 0.8951289635151625, "num_tokens": 334903246.0, "step": 694 }, { "entropy": 0.431640625, "epoch": 0.4329872856642949, "grad_norm": 0.006591225159473631, "learning_rate": 2.3641763050953423e-05, "loss": 0.4058, "mean_token_accuracy": 0.8936793152242899, "num_tokens": 335872418.0, "step": 696 }, { "entropy": 0.4361572265625, "epoch": 0.4342315020024107, "grad_norm": 0.005870311198304932, "learning_rate": 2.3623007189746798e-05, "loss": 0.4061, "mean_token_accuracy": 0.8937596697360277, "num_tokens": 336843723.0, "step": 698 }, { "entropy": 0.43109130859375, "epoch": 0.43547571834052645, "grad_norm": 0.005772369946573655, "learning_rate": 2.360425132854017e-05, "loss": 0.4021, "mean_token_accuracy": 0.8946464397013187, "num_tokens": 337810282.0, "step": 700 }, { "entropy": 0.4307861328125, "epoch": 0.43671993467864223, "grad_norm": 0.005710684645796877, "learning_rate": 2.358549546733354e-05, "loss": 0.4015, "mean_token_accuracy": 0.8945929054170847, "num_tokens": 338777013.0, "step": 702 }, { "entropy": 0.43115234375, "epoch": 0.43796415101675806, "grad_norm": 0.005822359447247074, "learning_rate": 2.3566739606126917e-05, "loss": 0.4038, "mean_token_accuracy": 0.8942882213741541, "num_tokens": 339736705.0, "step": 704 }, { "entropy": 0.4351806640625, "epoch": 0.43920836735487384, "grad_norm": 0.006174833963348362, "learning_rate": 2.354798374492029e-05, "loss": 0.4045, "mean_token_accuracy": 0.8944994900375605, "num_tokens": 340716593.0, "step": 706 }, { "entropy": 0.430908203125, "epoch": 0.4404525836929896, "grad_norm": 0.005770131710762657, "learning_rate": 2.352922788371366e-05, "loss": 0.4018, "mean_token_accuracy": 0.8946074191480875, "num_tokens": 341672258.0, "step": 708 }, { "entropy": 0.43072509765625, "epoch": 0.4416968000311054, "grad_norm": 0.005482276822345939, "learning_rate": 2.3510472022507032e-05, "loss": 0.398, "mean_token_accuracy": 0.8952425215393305, "num_tokens": 342653090.0, "step": 710 }, { "entropy": 0.4263916015625, "epoch": 0.4429410163692212, "grad_norm": 0.005382285167703327, "learning_rate": 2.3491716161300407e-05, "loss": 0.4, "mean_token_accuracy": 0.895192451775074, "num_tokens": 343605645.0, "step": 712 }, { "entropy": 0.43670654296875, "epoch": 0.444185232707337, "grad_norm": 0.005208421808607282, "learning_rate": 2.3472960300093783e-05, "loss": 0.4076, "mean_token_accuracy": 0.8938938360661268, "num_tokens": 344556919.0, "step": 714 }, { "entropy": 0.43359375, "epoch": 0.4454294490454528, "grad_norm": 0.006393842805142314, "learning_rate": 2.345420443888715e-05, "loss": 0.4044, "mean_token_accuracy": 0.8941977024078369, "num_tokens": 345515071.0, "step": 716 }, { "entropy": 0.4290771484375, "epoch": 0.44667366538356856, "grad_norm": 0.006510158975703788, "learning_rate": 2.3435448577680526e-05, "loss": 0.4043, "mean_token_accuracy": 0.8940989747643471, "num_tokens": 346485533.0, "step": 718 }, { "entropy": 0.4324951171875, "epoch": 0.44791788172168434, "grad_norm": 0.00651392926206167, "learning_rate": 2.3416692716473898e-05, "loss": 0.3998, "mean_token_accuracy": 0.8950411733239889, "num_tokens": 347437260.0, "step": 720 }, { "entropy": 0.43389892578125, "epoch": 0.4491620980598002, "grad_norm": 0.004867949660902705, "learning_rate": 2.3397936855267273e-05, "loss": 0.4042, "mean_token_accuracy": 0.8945440873503685, "num_tokens": 348403098.0, "step": 722 }, { "entropy": 0.42791748046875, "epoch": 0.45040631439791595, "grad_norm": 0.005504387714449053, "learning_rate": 2.3379180994060645e-05, "loss": 0.401, "mean_token_accuracy": 0.8949379548430443, "num_tokens": 349363661.0, "step": 724 }, { "entropy": 0.43218994140625, "epoch": 0.45165053073603173, "grad_norm": 0.006611332921103142, "learning_rate": 2.3360425132854017e-05, "loss": 0.4031, "mean_token_accuracy": 0.8939683884382248, "num_tokens": 350341427.0, "step": 726 }, { "entropy": 0.4332275390625, "epoch": 0.4528947470741475, "grad_norm": 0.006292361325100708, "learning_rate": 2.334166927164739e-05, "loss": 0.4054, "mean_token_accuracy": 0.8937624506652355, "num_tokens": 351309206.0, "step": 728 }, { "entropy": 0.42633056640625, "epoch": 0.4541389634122633, "grad_norm": 0.005663590335629667, "learning_rate": 2.3322913410440764e-05, "loss": 0.3985, "mean_token_accuracy": 0.8950860146433115, "num_tokens": 352280099.0, "step": 730 }, { "entropy": 0.43280029296875, "epoch": 0.4553831797503791, "grad_norm": 0.004599612879597806, "learning_rate": 2.330415754923414e-05, "loss": 0.4037, "mean_token_accuracy": 0.8936074003577232, "num_tokens": 353252742.0, "step": 732 }, { "entropy": 0.427490234375, "epoch": 0.4566273960884949, "grad_norm": 0.005277226456565376, "learning_rate": 2.3285401688027507e-05, "loss": 0.4019, "mean_token_accuracy": 0.8945307694375515, "num_tokens": 354206062.0, "step": 734 }, { "entropy": 0.42828369140625, "epoch": 0.4578716124266107, "grad_norm": 0.005110025592910202, "learning_rate": 2.3266645826820883e-05, "loss": 0.4018, "mean_token_accuracy": 0.8943059388548136, "num_tokens": 355194861.0, "step": 736 }, { "entropy": 0.4281005859375, "epoch": 0.45911582876472645, "grad_norm": 0.006017740303250675, "learning_rate": 2.3247889965614254e-05, "loss": 0.4003, "mean_token_accuracy": 0.8945384509861469, "num_tokens": 356165814.0, "step": 738 }, { "entropy": 0.43621826171875, "epoch": 0.46036004510284223, "grad_norm": 0.006405182689396423, "learning_rate": 2.322913410440763e-05, "loss": 0.4069, "mean_token_accuracy": 0.89368736743927, "num_tokens": 357139502.0, "step": 740 }, { "entropy": 0.42791748046875, "epoch": 0.46160426144095806, "grad_norm": 0.004775439385777363, "learning_rate": 2.3210378243201e-05, "loss": 0.4012, "mean_token_accuracy": 0.8948260340839624, "num_tokens": 358122406.0, "step": 742 }, { "entropy": 0.439453125, "epoch": 0.46284847777907384, "grad_norm": 0.006108561300270353, "learning_rate": 2.3191622381994373e-05, "loss": 0.4097, "mean_token_accuracy": 0.8931525833904743, "num_tokens": 359111273.0, "step": 744 }, { "entropy": 0.4295654296875, "epoch": 0.4640926941171896, "grad_norm": 0.006037910871837177, "learning_rate": 2.3172866520787745e-05, "loss": 0.3994, "mean_token_accuracy": 0.8954188451170921, "num_tokens": 360075118.0, "step": 746 }, { "entropy": 0.4259033203125, "epoch": 0.4653369104553054, "grad_norm": 0.005398624874834788, "learning_rate": 2.315411065958112e-05, "loss": 0.3959, "mean_token_accuracy": 0.8958598673343658, "num_tokens": 361048083.0, "step": 748 }, { "entropy": 0.43084716796875, "epoch": 0.46658112679342123, "grad_norm": 0.006173722574372501, "learning_rate": 2.3135354798374495e-05, "loss": 0.4057, "mean_token_accuracy": 0.8936866298317909, "num_tokens": 362016153.0, "step": 750 }, { "entropy": 0.42877197265625, "epoch": 0.467825343131537, "grad_norm": 0.005455269825648191, "learning_rate": 2.3116598937167864e-05, "loss": 0.404, "mean_token_accuracy": 0.8946151975542307, "num_tokens": 362966561.0, "step": 752 }, { "entropy": 0.43280029296875, "epoch": 0.4690695594696528, "grad_norm": 0.00532478665123582, "learning_rate": 2.309784307596124e-05, "loss": 0.4027, "mean_token_accuracy": 0.8944839723408222, "num_tokens": 363922349.0, "step": 754 }, { "entropy": 0.42608642578125, "epoch": 0.47031377580776856, "grad_norm": 0.005873907184272343, "learning_rate": 2.307908721475461e-05, "loss": 0.4009, "mean_token_accuracy": 0.8950569182634354, "num_tokens": 364903947.0, "step": 756 }, { "entropy": 0.41534423828125, "epoch": 0.47155799214588434, "grad_norm": 0.00504096915847256, "learning_rate": 2.3060331353547986e-05, "loss": 0.3886, "mean_token_accuracy": 0.8968163318932056, "num_tokens": 365865846.0, "step": 758 }, { "entropy": 0.43707275390625, "epoch": 0.4728022084840002, "grad_norm": 0.005506291626418638, "learning_rate": 2.3041575492341354e-05, "loss": 0.4083, "mean_token_accuracy": 0.8930707555264235, "num_tokens": 366841587.0, "step": 760 }, { "entropy": 0.427734375, "epoch": 0.47404642482211595, "grad_norm": 0.00496008047223737, "learning_rate": 2.302281963113473e-05, "loss": 0.3984, "mean_token_accuracy": 0.8945848625153303, "num_tokens": 367808593.0, "step": 762 }, { "entropy": 0.4227294921875, "epoch": 0.47529064116023173, "grad_norm": 0.005690306782088039, "learning_rate": 2.30040637699281e-05, "loss": 0.4014, "mean_token_accuracy": 0.8945651352405548, "num_tokens": 368775425.0, "step": 764 }, { "entropy": 0.43878173828125, "epoch": 0.4765348574983475, "grad_norm": 0.006071004811156871, "learning_rate": 2.2985307908721477e-05, "loss": 0.4046, "mean_token_accuracy": 0.8941430021077394, "num_tokens": 369756512.0, "step": 766 }, { "entropy": 0.4263916015625, "epoch": 0.47777907383646334, "grad_norm": 0.006312602503497685, "learning_rate": 2.2966552047514852e-05, "loss": 0.404, "mean_token_accuracy": 0.8939662631601095, "num_tokens": 370710781.0, "step": 768 }, { "entropy": 0.4229736328125, "epoch": 0.4790232901745791, "grad_norm": 0.006205897714825419, "learning_rate": 2.294779618630822e-05, "loss": 0.3965, "mean_token_accuracy": 0.8949891608208418, "num_tokens": 371669736.0, "step": 770 }, { "entropy": 0.42822265625, "epoch": 0.4802675065126949, "grad_norm": 0.007105037730702388, "learning_rate": 2.2929040325101595e-05, "loss": 0.4013, "mean_token_accuracy": 0.8940982632339001, "num_tokens": 372630893.0, "step": 772 }, { "entropy": 0.42645263671875, "epoch": 0.4815117228508107, "grad_norm": 0.005394271821725288, "learning_rate": 2.2910284463894967e-05, "loss": 0.3998, "mean_token_accuracy": 0.8947566635906696, "num_tokens": 373599940.0, "step": 774 }, { "entropy": 0.4326171875, "epoch": 0.48275593918892645, "grad_norm": 0.00485473242806387, "learning_rate": 2.2891528602688343e-05, "loss": 0.4045, "mean_token_accuracy": 0.8936690259724855, "num_tokens": 374575558.0, "step": 776 }, { "entropy": 0.42926025390625, "epoch": 0.4840001555270423, "grad_norm": 0.006105559006497616, "learning_rate": 2.287277274148171e-05, "loss": 0.3997, "mean_token_accuracy": 0.8953676205128431, "num_tokens": 375542189.0, "step": 778 }, { "entropy": 0.427490234375, "epoch": 0.48524437186515806, "grad_norm": 0.005448372260778256, "learning_rate": 2.2854016880275086e-05, "loss": 0.4023, "mean_token_accuracy": 0.8939507845789194, "num_tokens": 376503886.0, "step": 780 }, { "entropy": 0.41748046875, "epoch": 0.48648858820327384, "grad_norm": 0.005873859126771416, "learning_rate": 2.283526101906846e-05, "loss": 0.3918, "mean_token_accuracy": 0.8968390040099621, "num_tokens": 377464521.0, "step": 782 }, { "entropy": 0.43475341796875, "epoch": 0.4877328045413896, "grad_norm": 0.005840053505549676, "learning_rate": 2.2816505157861833e-05, "loss": 0.4047, "mean_token_accuracy": 0.8934551868587732, "num_tokens": 378444285.0, "step": 784 }, { "entropy": 0.43316650390625, "epoch": 0.4889770208795054, "grad_norm": 0.00548037614413805, "learning_rate": 2.2797749296655205e-05, "loss": 0.4043, "mean_token_accuracy": 0.8937012832611799, "num_tokens": 379415427.0, "step": 786 }, { "entropy": 0.42352294921875, "epoch": 0.49022123721762123, "grad_norm": 0.005334286795354248, "learning_rate": 2.2778993435448577e-05, "loss": 0.3996, "mean_token_accuracy": 0.8948980197310448, "num_tokens": 380397380.0, "step": 788 }, { "entropy": 0.42578125, "epoch": 0.491465453555737, "grad_norm": 0.005798303250283738, "learning_rate": 2.2760237574241952e-05, "loss": 0.3965, "mean_token_accuracy": 0.8951926827430725, "num_tokens": 381354387.0, "step": 790 }, { "entropy": 0.42633056640625, "epoch": 0.4927096698938528, "grad_norm": 0.006042108600671853, "learning_rate": 2.2741481713035324e-05, "loss": 0.401, "mean_token_accuracy": 0.8943822812289, "num_tokens": 382314431.0, "step": 792 }, { "entropy": 0.425048828125, "epoch": 0.49395388623196856, "grad_norm": 0.00520768089486778, "learning_rate": 2.27227258518287e-05, "loss": 0.3991, "mean_token_accuracy": 0.8951034266501665, "num_tokens": 383282772.0, "step": 794 }, { "entropy": 0.43157958984375, "epoch": 0.4951981025700844, "grad_norm": 0.004742200328555844, "learning_rate": 2.2703969990622067e-05, "loss": 0.4048, "mean_token_accuracy": 0.8936592414975166, "num_tokens": 384240545.0, "step": 796 }, { "entropy": 0.4295654296875, "epoch": 0.4964423189082002, "grad_norm": 0.006326424154226473, "learning_rate": 2.2685214129415443e-05, "loss": 0.4006, "mean_token_accuracy": 0.8943749088793993, "num_tokens": 385204206.0, "step": 798 }, { "entropy": 0.43170166015625, "epoch": 0.49768653524631595, "grad_norm": 0.005426861707163634, "learning_rate": 2.2666458268208818e-05, "loss": 0.4027, "mean_token_accuracy": 0.8943722769618034, "num_tokens": 386173483.0, "step": 800 }, { "epoch": 0.49768653524631595, "eval_entropy": 0.42349202594165436, "eval_loss": 0.39906057715415955, "eval_mean_token_accuracy": 0.8948053320102185, "eval_num_tokens": 386173483.0, "eval_runtime": 426.5423, "eval_samples_per_second": 203.098, "eval_steps_per_second": 3.174, "step": 800 }, { "entropy": 0.42010498046875, "epoch": 0.49893075158443173, "grad_norm": 0.005210935473224394, "learning_rate": 2.264770240700219e-05, "loss": 0.3975, "mean_token_accuracy": 0.8951733857393265, "num_tokens": 387135915.0, "step": 802 }, { "entropy": 0.4239501953125, "epoch": 0.5001749679225476, "grad_norm": 0.005684581730772873, "learning_rate": 2.262894654579556e-05, "loss": 0.394, "mean_token_accuracy": 0.895956264808774, "num_tokens": 388083980.0, "step": 804 }, { "entropy": 0.4224853515625, "epoch": 0.5014191842606633, "grad_norm": 0.006251160609709536, "learning_rate": 2.2610190684588933e-05, "loss": 0.3963, "mean_token_accuracy": 0.8958156444132328, "num_tokens": 389039687.0, "step": 806 }, { "entropy": 0.42578125, "epoch": 0.5026634005987791, "grad_norm": 0.0052651006218964475, "learning_rate": 2.259143482338231e-05, "loss": 0.3999, "mean_token_accuracy": 0.8947747759521008, "num_tokens": 390001080.0, "step": 808 }, { "entropy": 0.4332275390625, "epoch": 0.5039076169368949, "grad_norm": 0.005121608737530232, "learning_rate": 2.257267896217568e-05, "loss": 0.4031, "mean_token_accuracy": 0.8937738910317421, "num_tokens": 390973736.0, "step": 810 }, { "entropy": 0.42730712890625, "epoch": 0.5051518332750107, "grad_norm": 0.006374642770867791, "learning_rate": 2.2553923100969055e-05, "loss": 0.402, "mean_token_accuracy": 0.8944003246724606, "num_tokens": 391941965.0, "step": 812 }, { "entropy": 0.42523193359375, "epoch": 0.5063960496131265, "grad_norm": 0.005528362003571673, "learning_rate": 2.2535167239762424e-05, "loss": 0.3967, "mean_token_accuracy": 0.8955277260392904, "num_tokens": 392906737.0, "step": 814 }, { "entropy": 0.4296875, "epoch": 0.5076402659512422, "grad_norm": 0.005483826248175075, "learning_rate": 2.25164113785558e-05, "loss": 0.4017, "mean_token_accuracy": 0.8942700810730457, "num_tokens": 393891666.0, "step": 816 }, { "entropy": 0.4254150390625, "epoch": 0.508884482289358, "grad_norm": 0.005535485437322444, "learning_rate": 2.2497655517349174e-05, "loss": 0.4015, "mean_token_accuracy": 0.8937990870326757, "num_tokens": 394849298.0, "step": 818 }, { "entropy": 0.423583984375, "epoch": 0.5101286986274739, "grad_norm": 0.005207767656973928, "learning_rate": 2.2478899656142546e-05, "loss": 0.3985, "mean_token_accuracy": 0.8951779305934906, "num_tokens": 395797328.0, "step": 820 }, { "entropy": 0.4256591796875, "epoch": 0.5113729149655897, "grad_norm": 0.005158752580815098, "learning_rate": 2.2460143794935918e-05, "loss": 0.3989, "mean_token_accuracy": 0.8950697015970945, "num_tokens": 396760244.0, "step": 822 }, { "entropy": 0.424560546875, "epoch": 0.5126171313037055, "grad_norm": 0.00515149111518127, "learning_rate": 2.244138793372929e-05, "loss": 0.3961, "mean_token_accuracy": 0.8956128414720297, "num_tokens": 397721356.0, "step": 824 }, { "entropy": 0.41876220703125, "epoch": 0.5138613476418212, "grad_norm": 0.005421687423445054, "learning_rate": 2.2422632072522665e-05, "loss": 0.3943, "mean_token_accuracy": 0.8957666009664536, "num_tokens": 398683850.0, "step": 826 }, { "entropy": 0.42144775390625, "epoch": 0.515105563979937, "grad_norm": 0.0052093777925430575, "learning_rate": 2.2403876211316037e-05, "loss": 0.3929, "mean_token_accuracy": 0.896053334698081, "num_tokens": 399642467.0, "step": 828 }, { "entropy": 0.4229736328125, "epoch": 0.5163497803180528, "grad_norm": 0.0049202534571896285, "learning_rate": 2.238512035010941e-05, "loss": 0.3985, "mean_token_accuracy": 0.8951233774423599, "num_tokens": 400611257.0, "step": 830 }, { "entropy": 0.42413330078125, "epoch": 0.5175939966561686, "grad_norm": 0.004888660382220434, "learning_rate": 2.236636448890278e-05, "loss": 0.3957, "mean_token_accuracy": 0.8956693802028894, "num_tokens": 401584723.0, "step": 832 }, { "entropy": 0.4266357421875, "epoch": 0.5188382129942843, "grad_norm": 0.0055791077541120625, "learning_rate": 2.2347608627696155e-05, "loss": 0.3988, "mean_token_accuracy": 0.8948435429483652, "num_tokens": 402549120.0, "step": 834 }, { "entropy": 0.41754150390625, "epoch": 0.5200824293324001, "grad_norm": 0.004853943211091741, "learning_rate": 2.232885276648953e-05, "loss": 0.3911, "mean_token_accuracy": 0.8962593711912632, "num_tokens": 403510622.0, "step": 836 }, { "entropy": 0.4234619140625, "epoch": 0.521326645670516, "grad_norm": 0.005408744118553746, "learning_rate": 2.2310096905282902e-05, "loss": 0.3983, "mean_token_accuracy": 0.8950631767511368, "num_tokens": 404483959.0, "step": 838 }, { "entropy": 0.41851806640625, "epoch": 0.5225708620086318, "grad_norm": 0.005000743693621137, "learning_rate": 2.2291341044076274e-05, "loss": 0.3939, "mean_token_accuracy": 0.8957247994840145, "num_tokens": 405432416.0, "step": 840 }, { "entropy": 0.42535400390625, "epoch": 0.5238150783467476, "grad_norm": 0.006133094534259653, "learning_rate": 2.2272585182869646e-05, "loss": 0.3972, "mean_token_accuracy": 0.8951957300305367, "num_tokens": 406400931.0, "step": 842 }, { "entropy": 0.424560546875, "epoch": 0.5250592946848633, "grad_norm": 0.004842243492219702, "learning_rate": 2.225382932166302e-05, "loss": 0.3972, "mean_token_accuracy": 0.8951044119894505, "num_tokens": 407355699.0, "step": 844 }, { "entropy": 0.42535400390625, "epoch": 0.5263035110229791, "grad_norm": 0.005410402430468196, "learning_rate": 2.2235073460456393e-05, "loss": 0.4017, "mean_token_accuracy": 0.8942126501351595, "num_tokens": 408318930.0, "step": 846 }, { "entropy": 0.4132080078125, "epoch": 0.5275477273610949, "grad_norm": 0.006054432019417272, "learning_rate": 2.2216317599249765e-05, "loss": 0.3846, "mean_token_accuracy": 0.8976531885564327, "num_tokens": 409272007.0, "step": 848 }, { "entropy": 0.41888427734375, "epoch": 0.5287919436992107, "grad_norm": 0.005550754889764139, "learning_rate": 2.219756173804314e-05, "loss": 0.3935, "mean_token_accuracy": 0.8961314652115107, "num_tokens": 410224905.0, "step": 850 }, { "entropy": 0.42218017578125, "epoch": 0.5300361600373265, "grad_norm": 0.005509633864997546, "learning_rate": 2.2178805876836512e-05, "loss": 0.3956, "mean_token_accuracy": 0.8959508761763573, "num_tokens": 411184741.0, "step": 852 }, { "entropy": 0.43206787109375, "epoch": 0.5312803763754422, "grad_norm": 0.005106420945942814, "learning_rate": 2.2160050015629887e-05, "loss": 0.4046, "mean_token_accuracy": 0.8941086158156395, "num_tokens": 412170095.0, "step": 854 }, { "entropy": 0.4261474609375, "epoch": 0.5325245927135581, "grad_norm": 0.005120240232705429, "learning_rate": 2.214129415442326e-05, "loss": 0.4007, "mean_token_accuracy": 0.8944654427468777, "num_tokens": 413139145.0, "step": 856 }, { "entropy": 0.423095703125, "epoch": 0.5337688090516739, "grad_norm": 0.005718612203169976, "learning_rate": 2.212253829321663e-05, "loss": 0.392, "mean_token_accuracy": 0.8961930088698864, "num_tokens": 414113353.0, "step": 858 }, { "entropy": 0.41552734375, "epoch": 0.5350130253897897, "grad_norm": 0.005694992106292023, "learning_rate": 2.2103782432010002e-05, "loss": 0.3932, "mean_token_accuracy": 0.8958910331130028, "num_tokens": 415083689.0, "step": 860 }, { "entropy": 0.41815185546875, "epoch": 0.5362572417279055, "grad_norm": 0.006142074431874336, "learning_rate": 2.2085026570803378e-05, "loss": 0.3952, "mean_token_accuracy": 0.8955587688833475, "num_tokens": 416045407.0, "step": 862 }, { "entropy": 0.42449951171875, "epoch": 0.5375014580660212, "grad_norm": 0.005822866510149946, "learning_rate": 2.206627070959675e-05, "loss": 0.3979, "mean_token_accuracy": 0.8949081879109144, "num_tokens": 417014377.0, "step": 864 }, { "entropy": 0.4227294921875, "epoch": 0.538745674404137, "grad_norm": 0.00545338630885265, "learning_rate": 2.204751484839012e-05, "loss": 0.3983, "mean_token_accuracy": 0.8952354658395052, "num_tokens": 417972829.0, "step": 866 }, { "entropy": 0.4251708984375, "epoch": 0.5399898907422528, "grad_norm": 0.005678831720360496, "learning_rate": 2.2028758987183496e-05, "loss": 0.3953, "mean_token_accuracy": 0.8954550456255674, "num_tokens": 418940693.0, "step": 868 }, { "entropy": 0.4207763671875, "epoch": 0.5412341070803686, "grad_norm": 0.007021437237875551, "learning_rate": 2.2010003125976868e-05, "loss": 0.3958, "mean_token_accuracy": 0.8954338114708662, "num_tokens": 419897204.0, "step": 870 }, { "entropy": 0.42047119140625, "epoch": 0.5424783234184843, "grad_norm": 0.006689095391585308, "learning_rate": 2.1991247264770243e-05, "loss": 0.3928, "mean_token_accuracy": 0.8957767300307751, "num_tokens": 420857712.0, "step": 872 }, { "entropy": 0.41375732421875, "epoch": 0.5437225397566002, "grad_norm": 0.0066080892949004644, "learning_rate": 2.1972491403563612e-05, "loss": 0.393, "mean_token_accuracy": 0.8958307560533285, "num_tokens": 421812479.0, "step": 874 }, { "entropy": 0.41925048828125, "epoch": 0.544966756094716, "grad_norm": 0.006524558186253598, "learning_rate": 2.1953735542356987e-05, "loss": 0.3926, "mean_token_accuracy": 0.8959525153040886, "num_tokens": 422778535.0, "step": 876 }, { "entropy": 0.42535400390625, "epoch": 0.5462109724328318, "grad_norm": 0.006096285945152476, "learning_rate": 2.193497968115036e-05, "loss": 0.399, "mean_token_accuracy": 0.8946866076439619, "num_tokens": 423758706.0, "step": 878 }, { "entropy": 0.41357421875, "epoch": 0.5474551887709476, "grad_norm": 0.005394357170184733, "learning_rate": 2.1916223819943734e-05, "loss": 0.3885, "mean_token_accuracy": 0.8966800943017006, "num_tokens": 424706121.0, "step": 880 }, { "entropy": 0.427490234375, "epoch": 0.5486994051090633, "grad_norm": 0.0062373279603809536, "learning_rate": 2.1897467958737106e-05, "loss": 0.3991, "mean_token_accuracy": 0.8946398999541998, "num_tokens": 425671884.0, "step": 882 }, { "entropy": 0.4146728515625, "epoch": 0.5499436214471791, "grad_norm": 0.005452059225351367, "learning_rate": 2.1878712097530478e-05, "loss": 0.3912, "mean_token_accuracy": 0.8959906548261642, "num_tokens": 426647306.0, "step": 884 }, { "entropy": 0.42449951171875, "epoch": 0.5511878377852949, "grad_norm": 0.005732830716264577, "learning_rate": 2.1859956236323853e-05, "loss": 0.399, "mean_token_accuracy": 0.8948819655925035, "num_tokens": 427627251.0, "step": 886 }, { "entropy": 0.421630859375, "epoch": 0.5524320541234107, "grad_norm": 0.00509891019961575, "learning_rate": 2.1841200375117225e-05, "loss": 0.3938, "mean_token_accuracy": 0.8955186046659946, "num_tokens": 428588737.0, "step": 888 }, { "entropy": 0.41729736328125, "epoch": 0.5536762704615265, "grad_norm": 0.005756626093746206, "learning_rate": 2.18224445139106e-05, "loss": 0.3966, "mean_token_accuracy": 0.8952113427221775, "num_tokens": 429549752.0, "step": 890 }, { "entropy": 0.4169921875, "epoch": 0.5549204867996423, "grad_norm": 0.006493255375678816, "learning_rate": 2.1803688652703968e-05, "loss": 0.3896, "mean_token_accuracy": 0.8964546527713537, "num_tokens": 430528925.0, "step": 892 }, { "entropy": 0.42340087890625, "epoch": 0.5561647031377581, "grad_norm": 0.0052620880447129086, "learning_rate": 2.1784932791497343e-05, "loss": 0.3994, "mean_token_accuracy": 0.8946353942155838, "num_tokens": 431497090.0, "step": 894 }, { "entropy": 0.423583984375, "epoch": 0.5574089194758739, "grad_norm": 0.005225091149238138, "learning_rate": 2.1766176930290715e-05, "loss": 0.3974, "mean_token_accuracy": 0.8949866145849228, "num_tokens": 432462774.0, "step": 896 }, { "entropy": 0.4168701171875, "epoch": 0.5586531358139897, "grad_norm": 0.005333520690167019, "learning_rate": 2.174742106908409e-05, "loss": 0.3876, "mean_token_accuracy": 0.8970209751278162, "num_tokens": 433430552.0, "step": 898 }, { "entropy": 0.4136962890625, "epoch": 0.5598973521521055, "grad_norm": 0.00491674217937417, "learning_rate": 2.1728665207877462e-05, "loss": 0.3908, "mean_token_accuracy": 0.8967483062297106, "num_tokens": 434401408.0, "step": 900 }, { "entropy": 0.42095947265625, "epoch": 0.5611415684902212, "grad_norm": 0.005268884860327853, "learning_rate": 2.1709909346670834e-05, "loss": 0.3953, "mean_token_accuracy": 0.8958153557032347, "num_tokens": 435368547.0, "step": 902 }, { "entropy": 0.41876220703125, "epoch": 0.562385784828337, "grad_norm": 0.005263832706934334, "learning_rate": 2.169115348546421e-05, "loss": 0.3924, "mean_token_accuracy": 0.8956536669284105, "num_tokens": 436320642.0, "step": 904 }, { "entropy": 0.4176025390625, "epoch": 0.5636300011664528, "grad_norm": 0.006764095729803573, "learning_rate": 2.167239762425758e-05, "loss": 0.3957, "mean_token_accuracy": 0.8953577503561974, "num_tokens": 437284775.0, "step": 906 }, { "entropy": 0.41375732421875, "epoch": 0.5648742175045686, "grad_norm": 0.005540746079894225, "learning_rate": 2.1653641763050956e-05, "loss": 0.386, "mean_token_accuracy": 0.8972478602081537, "num_tokens": 438238034.0, "step": 908 }, { "entropy": 0.426025390625, "epoch": 0.5661184338426843, "grad_norm": 0.005110930308108029, "learning_rate": 2.1634885901844325e-05, "loss": 0.3974, "mean_token_accuracy": 0.8950601629912853, "num_tokens": 439206003.0, "step": 910 }, { "entropy": 0.41552734375, "epoch": 0.5673626501808002, "grad_norm": 0.005395006251167204, "learning_rate": 2.16161300406377e-05, "loss": 0.3901, "mean_token_accuracy": 0.8967151660472155, "num_tokens": 440176667.0, "step": 912 }, { "entropy": 0.4217529296875, "epoch": 0.568606866518916, "grad_norm": 0.005311057819537625, "learning_rate": 2.1597374179431072e-05, "loss": 0.3958, "mean_token_accuracy": 0.895380400121212, "num_tokens": 441142740.0, "step": 914 }, { "entropy": 0.41888427734375, "epoch": 0.5698510828570318, "grad_norm": 0.005940374180452318, "learning_rate": 2.1578618318224447e-05, "loss": 0.391, "mean_token_accuracy": 0.8958764150738716, "num_tokens": 442109543.0, "step": 916 }, { "entropy": 0.41845703125, "epoch": 0.5710952991951476, "grad_norm": 0.006542688991423096, "learning_rate": 2.155986245701782e-05, "loss": 0.3966, "mean_token_accuracy": 0.8952454756945372, "num_tokens": 443068435.0, "step": 918 }, { "entropy": 0.422607421875, "epoch": 0.5723395155332633, "grad_norm": 0.005179266542414117, "learning_rate": 2.154110659581119e-05, "loss": 0.3966, "mean_token_accuracy": 0.8957321774214506, "num_tokens": 444048308.0, "step": 920 }, { "entropy": 0.41522216796875, "epoch": 0.5735837318713791, "grad_norm": 0.005918696913501304, "learning_rate": 2.1522350734604566e-05, "loss": 0.3897, "mean_token_accuracy": 0.896596223115921, "num_tokens": 445006600.0, "step": 922 }, { "entropy": 0.41650390625, "epoch": 0.5748279482094949, "grad_norm": 0.005496031881707057, "learning_rate": 2.1503594873397938e-05, "loss": 0.3878, "mean_token_accuracy": 0.8963942788541317, "num_tokens": 445962540.0, "step": 924 }, { "entropy": 0.4178466796875, "epoch": 0.5760721645476107, "grad_norm": 0.006067212984241164, "learning_rate": 2.1484839012191313e-05, "loss": 0.3964, "mean_token_accuracy": 0.8954455945640802, "num_tokens": 446926262.0, "step": 926 }, { "entropy": 0.41961669921875, "epoch": 0.5773163808857265, "grad_norm": 0.007333882347511627, "learning_rate": 2.146608315098468e-05, "loss": 0.3912, "mean_token_accuracy": 0.8964776452630758, "num_tokens": 447903542.0, "step": 928 }, { "entropy": 0.41925048828125, "epoch": 0.5785605972238423, "grad_norm": 0.006592587543581702, "learning_rate": 2.1447327289778056e-05, "loss": 0.3942, "mean_token_accuracy": 0.8953205682337284, "num_tokens": 448868181.0, "step": 930 }, { "entropy": 0.41217041015625, "epoch": 0.5798048135619581, "grad_norm": 0.005481044939357204, "learning_rate": 2.1428571428571428e-05, "loss": 0.3908, "mean_token_accuracy": 0.8962942380458117, "num_tokens": 449831370.0, "step": 932 }, { "entropy": 0.42138671875, "epoch": 0.5810490299000739, "grad_norm": 0.005106893255015684, "learning_rate": 2.1409815567364803e-05, "loss": 0.3902, "mean_token_accuracy": 0.8959232661873102, "num_tokens": 450787844.0, "step": 934 }, { "entropy": 0.40704345703125, "epoch": 0.5822932462381897, "grad_norm": 0.00775065757236855, "learning_rate": 2.1391059706158175e-05, "loss": 0.3859, "mean_token_accuracy": 0.8974800556898117, "num_tokens": 451734804.0, "step": 936 }, { "entropy": 0.41278076171875, "epoch": 0.5835374625763055, "grad_norm": 0.005365363016262376, "learning_rate": 2.1372303844951547e-05, "loss": 0.3882, "mean_token_accuracy": 0.8968946225941181, "num_tokens": 452693682.0, "step": 938 }, { "entropy": 0.4244384765625, "epoch": 0.5847816789144212, "grad_norm": 0.00546846512727055, "learning_rate": 2.1353547983744922e-05, "loss": 0.3986, "mean_token_accuracy": 0.8949402868747711, "num_tokens": 453647832.0, "step": 940 }, { "entropy": 0.4178466796875, "epoch": 0.586025895252537, "grad_norm": 0.005525322099725653, "learning_rate": 2.1334792122538294e-05, "loss": 0.3929, "mean_token_accuracy": 0.8958199508488178, "num_tokens": 454619182.0, "step": 942 }, { "entropy": 0.42041015625, "epoch": 0.5872701115906528, "grad_norm": 0.005176118503221973, "learning_rate": 2.1316036261331666e-05, "loss": 0.3961, "mean_token_accuracy": 0.8951426073908806, "num_tokens": 455585773.0, "step": 944 }, { "entropy": 0.41864013671875, "epoch": 0.5885143279287686, "grad_norm": 0.005217591410724491, "learning_rate": 2.1297280400125038e-05, "loss": 0.3904, "mean_token_accuracy": 0.8962810467928648, "num_tokens": 456551580.0, "step": 946 }, { "entropy": 0.4178466796875, "epoch": 0.5897585442668845, "grad_norm": 0.005334491269210712, "learning_rate": 2.1278524538918413e-05, "loss": 0.3905, "mean_token_accuracy": 0.8959276229143143, "num_tokens": 457513377.0, "step": 948 }, { "entropy": 0.4169921875, "epoch": 0.5910027606050002, "grad_norm": 0.006072787016798396, "learning_rate": 2.1259768677711785e-05, "loss": 0.3939, "mean_token_accuracy": 0.8955205325037241, "num_tokens": 458480907.0, "step": 950 }, { "entropy": 0.42376708984375, "epoch": 0.592246976943116, "grad_norm": 0.005686175970057782, "learning_rate": 2.124101281650516e-05, "loss": 0.399, "mean_token_accuracy": 0.894887562841177, "num_tokens": 459447563.0, "step": 952 }, { "entropy": 0.41119384765625, "epoch": 0.5934911932812318, "grad_norm": 0.004854803605695167, "learning_rate": 2.122225695529853e-05, "loss": 0.3874, "mean_token_accuracy": 0.8969063758850098, "num_tokens": 460425227.0, "step": 954 }, { "entropy": 0.4111328125, "epoch": 0.5947354096193476, "grad_norm": 0.005282969548197382, "learning_rate": 2.1203501094091903e-05, "loss": 0.3892, "mean_token_accuracy": 0.8965616319328547, "num_tokens": 461381647.0, "step": 956 }, { "entropy": 0.41790771484375, "epoch": 0.5959796259574633, "grad_norm": 0.005404057337375383, "learning_rate": 2.118474523288528e-05, "loss": 0.3918, "mean_token_accuracy": 0.895570520311594, "num_tokens": 462348832.0, "step": 958 }, { "entropy": 0.4205322265625, "epoch": 0.5972238422955791, "grad_norm": 0.005210715731725561, "learning_rate": 2.116598937167865e-05, "loss": 0.3923, "mean_token_accuracy": 0.8959782626479864, "num_tokens": 463316969.0, "step": 960 }, { "entropy": 0.41131591796875, "epoch": 0.5984680586336949, "grad_norm": 0.005590049637100757, "learning_rate": 2.1147233510472022e-05, "loss": 0.3865, "mean_token_accuracy": 0.8975994568318129, "num_tokens": 464268835.0, "step": 962 }, { "entropy": 0.40716552734375, "epoch": 0.5997122749718107, "grad_norm": 0.005100676347107371, "learning_rate": 2.1128477649265394e-05, "loss": 0.3851, "mean_token_accuracy": 0.8974907249212265, "num_tokens": 465222631.0, "step": 964 }, { "entropy": 0.413818359375, "epoch": 0.6009564913099266, "grad_norm": 0.005964640349392827, "learning_rate": 2.110972178805877e-05, "loss": 0.3866, "mean_token_accuracy": 0.8966084290295839, "num_tokens": 466180606.0, "step": 966 }, { "entropy": 0.418701171875, "epoch": 0.6022007076480423, "grad_norm": 0.005120312628314165, "learning_rate": 2.109096592685214e-05, "loss": 0.3906, "mean_token_accuracy": 0.8962235953658819, "num_tokens": 467134168.0, "step": 968 }, { "entropy": 0.4149169921875, "epoch": 0.6034449239861581, "grad_norm": 0.0051620261379773675, "learning_rate": 2.1072210065645516e-05, "loss": 0.3926, "mean_token_accuracy": 0.8957581054419279, "num_tokens": 468093622.0, "step": 970 }, { "entropy": 0.41571044921875, "epoch": 0.6046891403242739, "grad_norm": 0.00541400054657491, "learning_rate": 2.1053454204438888e-05, "loss": 0.3916, "mean_token_accuracy": 0.8962336555123329, "num_tokens": 469058433.0, "step": 972 }, { "entropy": 0.41357421875, "epoch": 0.6059333566623897, "grad_norm": 0.005771640687800066, "learning_rate": 2.103469834323226e-05, "loss": 0.385, "mean_token_accuracy": 0.8976070433855057, "num_tokens": 470004840.0, "step": 974 }, { "entropy": 0.4112548828125, "epoch": 0.6071775730005055, "grad_norm": 0.005717723557986113, "learning_rate": 2.1015942482025635e-05, "loss": 0.3871, "mean_token_accuracy": 0.8968115597963333, "num_tokens": 470958205.0, "step": 976 }, { "entropy": 0.42095947265625, "epoch": 0.6084217893386212, "grad_norm": 0.005491747810104913, "learning_rate": 2.0997186620819007e-05, "loss": 0.4009, "mean_token_accuracy": 0.8947882018983364, "num_tokens": 471930906.0, "step": 978 }, { "entropy": 0.42901611328125, "epoch": 0.609666005676737, "grad_norm": 0.0066821638601860765, "learning_rate": 2.097843075961238e-05, "loss": 0.393, "mean_token_accuracy": 0.8960137739777565, "num_tokens": 472903190.0, "step": 980 }, { "entropy": 0.41650390625, "epoch": 0.6109102220148528, "grad_norm": 0.005864078122789096, "learning_rate": 2.095967489840575e-05, "loss": 0.3902, "mean_token_accuracy": 0.8962555807083845, "num_tokens": 473876457.0, "step": 982 }, { "entropy": 0.4095458984375, "epoch": 0.6121544383529687, "grad_norm": 0.005349562268201124, "learning_rate": 2.0940919037199126e-05, "loss": 0.388, "mean_token_accuracy": 0.896767919883132, "num_tokens": 474836723.0, "step": 984 }, { "entropy": 0.42498779296875, "epoch": 0.6133986546910845, "grad_norm": 0.005670662004264218, "learning_rate": 2.09221631759925e-05, "loss": 0.394, "mean_token_accuracy": 0.8958625681698322, "num_tokens": 475801511.0, "step": 986 }, { "entropy": 0.41021728515625, "epoch": 0.6146428710292002, "grad_norm": 0.005551624053510003, "learning_rate": 2.090340731478587e-05, "loss": 0.3879, "mean_token_accuracy": 0.8971525598317385, "num_tokens": 476764336.0, "step": 988 }, { "entropy": 0.41351318359375, "epoch": 0.615887087367316, "grad_norm": 0.0053418284881240645, "learning_rate": 2.0884651453579244e-05, "loss": 0.3885, "mean_token_accuracy": 0.8968911021947861, "num_tokens": 477731200.0, "step": 990 }, { "entropy": 0.41705322265625, "epoch": 0.6171313037054318, "grad_norm": 0.005785835062548755, "learning_rate": 2.0865895592372616e-05, "loss": 0.3921, "mean_token_accuracy": 0.8961762506514788, "num_tokens": 478697972.0, "step": 992 }, { "entropy": 0.4154052734375, "epoch": 0.6183755200435476, "grad_norm": 0.005214538001014637, "learning_rate": 2.084713973116599e-05, "loss": 0.3893, "mean_token_accuracy": 0.8963216431438923, "num_tokens": 479663827.0, "step": 994 }, { "entropy": 0.40765380859375, "epoch": 0.6196197363816633, "grad_norm": 0.00521225128741818, "learning_rate": 2.0828383869959363e-05, "loss": 0.3822, "mean_token_accuracy": 0.8979937825351954, "num_tokens": 480651422.0, "step": 996 }, { "entropy": 0.41375732421875, "epoch": 0.6208639527197791, "grad_norm": 0.006006659590877432, "learning_rate": 2.0809628008752735e-05, "loss": 0.3869, "mean_token_accuracy": 0.8966730367392302, "num_tokens": 481610474.0, "step": 998 }, { "entropy": 0.41302490234375, "epoch": 0.6221081690578949, "grad_norm": 0.006100626505091027, "learning_rate": 2.0790872147546107e-05, "loss": 0.3866, "mean_token_accuracy": 0.8971083834767342, "num_tokens": 482578491.0, "step": 1000 }, { "epoch": 0.6221081690578949, "eval_entropy": 0.4110289535635155, "eval_loss": 0.3900891840457916, "eval_mean_token_accuracy": 0.8963545168608295, "eval_num_tokens": 482578491.0, "eval_runtime": 425.7215, "eval_samples_per_second": 203.49, "eval_steps_per_second": 3.18, "step": 1000 }, { "entropy": 0.4139404296875, "epoch": 0.6233523853960107, "grad_norm": 0.005565603771424403, "learning_rate": 2.0772116286339482e-05, "loss": 0.3935, "mean_token_accuracy": 0.8956682551652193, "num_tokens": 483543622.0, "step": 1002 }, { "entropy": 0.42120361328125, "epoch": 0.6245966017341266, "grad_norm": 0.005335766827071883, "learning_rate": 2.0753360425132857e-05, "loss": 0.394, "mean_token_accuracy": 0.8953648395836353, "num_tokens": 484523685.0, "step": 1004 }, { "entropy": 0.4039306640625, "epoch": 0.6258408180722423, "grad_norm": 0.0054480087765319644, "learning_rate": 2.0734604563926226e-05, "loss": 0.3836, "mean_token_accuracy": 0.897576168179512, "num_tokens": 485478874.0, "step": 1006 }, { "entropy": 0.41143798828125, "epoch": 0.6270850344103581, "grad_norm": 0.005064260800619826, "learning_rate": 2.07158487027196e-05, "loss": 0.3891, "mean_token_accuracy": 0.8967862222343683, "num_tokens": 486456658.0, "step": 1008 }, { "entropy": 0.41668701171875, "epoch": 0.6283292507484739, "grad_norm": 0.005936727671255164, "learning_rate": 2.0697092841512973e-05, "loss": 0.3865, "mean_token_accuracy": 0.8969843033701181, "num_tokens": 487429206.0, "step": 1010 }, { "entropy": 0.4111328125, "epoch": 0.6295734670865897, "grad_norm": 0.006968814804260769, "learning_rate": 2.0678336980306348e-05, "loss": 0.3919, "mean_token_accuracy": 0.8963109701871872, "num_tokens": 488387742.0, "step": 1012 }, { "entropy": 0.41033935546875, "epoch": 0.6308176834247055, "grad_norm": 0.006894010314515511, "learning_rate": 2.065958111909972e-05, "loss": 0.3844, "mean_token_accuracy": 0.8980285171419382, "num_tokens": 489351279.0, "step": 1014 }, { "entropy": 0.41741943359375, "epoch": 0.6320618997628212, "grad_norm": 0.005361119824338836, "learning_rate": 2.064082525789309e-05, "loss": 0.3892, "mean_token_accuracy": 0.8967837411910295, "num_tokens": 490322551.0, "step": 1016 }, { "entropy": 0.413818359375, "epoch": 0.633306116100937, "grad_norm": 0.005780349747848976, "learning_rate": 2.0622069396686463e-05, "loss": 0.3885, "mean_token_accuracy": 0.8962384201586246, "num_tokens": 491300480.0, "step": 1018 }, { "entropy": 0.4263916015625, "epoch": 0.6345503324390528, "grad_norm": 0.005854125884656781, "learning_rate": 2.060331353547984e-05, "loss": 0.4018, "mean_token_accuracy": 0.8937140423804522, "num_tokens": 492272770.0, "step": 1020 }, { "entropy": 0.4217529296875, "epoch": 0.6357945487771687, "grad_norm": 0.005520334524796313, "learning_rate": 2.0584557674273214e-05, "loss": 0.39, "mean_token_accuracy": 0.8965720795094967, "num_tokens": 493253895.0, "step": 1022 }, { "entropy": 0.40399169921875, "epoch": 0.6370387651152845, "grad_norm": 0.006606519308837761, "learning_rate": 2.0565801813066582e-05, "loss": 0.3812, "mean_token_accuracy": 0.8982302602380514, "num_tokens": 494210671.0, "step": 1024 }, { "entropy": 0.4112548828125, "epoch": 0.6382829814534002, "grad_norm": 0.005746590797221311, "learning_rate": 2.0547045951859957e-05, "loss": 0.3884, "mean_token_accuracy": 0.8967691697180271, "num_tokens": 495168428.0, "step": 1026 }, { "entropy": 0.4200439453125, "epoch": 0.639527197791516, "grad_norm": 0.005858471253625426, "learning_rate": 2.052829009065333e-05, "loss": 0.3938, "mean_token_accuracy": 0.896326458081603, "num_tokens": 496130987.0, "step": 1028 }, { "entropy": 0.40765380859375, "epoch": 0.6407714141296318, "grad_norm": 0.006388810067691431, "learning_rate": 2.0509534229446704e-05, "loss": 0.385, "mean_token_accuracy": 0.8975093718618155, "num_tokens": 497093874.0, "step": 1030 }, { "entropy": 0.40985107421875, "epoch": 0.6420156304677476, "grad_norm": 0.0055447028142307725, "learning_rate": 2.0490778368240073e-05, "loss": 0.3835, "mean_token_accuracy": 0.8980830013751984, "num_tokens": 498051613.0, "step": 1032 }, { "entropy": 0.42486572265625, "epoch": 0.6432598468058633, "grad_norm": 0.006223126797969318, "learning_rate": 2.0472022507033448e-05, "loss": 0.3935, "mean_token_accuracy": 0.8959597405046225, "num_tokens": 499021332.0, "step": 1034 }, { "entropy": 0.4095458984375, "epoch": 0.6445040631439791, "grad_norm": 0.0054579041560269045, "learning_rate": 2.045326664582682e-05, "loss": 0.3871, "mean_token_accuracy": 0.896736666560173, "num_tokens": 499989397.0, "step": 1036 }, { "entropy": 0.409912109375, "epoch": 0.6457482794820949, "grad_norm": 0.0053096533741166544, "learning_rate": 2.0434510784620195e-05, "loss": 0.3847, "mean_token_accuracy": 0.8976735696196556, "num_tokens": 500954721.0, "step": 1038 }, { "entropy": 0.42291259765625, "epoch": 0.6469924958202108, "grad_norm": 0.0056120788017657176, "learning_rate": 2.041575492341357e-05, "loss": 0.3945, "mean_token_accuracy": 0.8958178330212831, "num_tokens": 501931119.0, "step": 1040 }, { "entropy": 0.414794921875, "epoch": 0.6482367121583266, "grad_norm": 0.006008045151301368, "learning_rate": 2.039699906220694e-05, "loss": 0.392, "mean_token_accuracy": 0.8959078155457973, "num_tokens": 502901720.0, "step": 1042 }, { "entropy": 0.40966796875, "epoch": 0.6494809284964423, "grad_norm": 0.005416058694271658, "learning_rate": 2.0378243201000314e-05, "loss": 0.3888, "mean_token_accuracy": 0.8966613598167896, "num_tokens": 503867998.0, "step": 1044 }, { "entropy": 0.424560546875, "epoch": 0.6507251448345581, "grad_norm": 0.005607197377611443, "learning_rate": 2.0359487339793686e-05, "loss": 0.3943, "mean_token_accuracy": 0.8953194171190262, "num_tokens": 504836399.0, "step": 1046 }, { "entropy": 0.4102783203125, "epoch": 0.6519693611726739, "grad_norm": 0.005985227109415413, "learning_rate": 2.034073147858706e-05, "loss": 0.3918, "mean_token_accuracy": 0.8959389068186283, "num_tokens": 505795869.0, "step": 1048 }, { "entropy": 0.416259765625, "epoch": 0.6532135775107897, "grad_norm": 0.005203320627691866, "learning_rate": 2.032197561738043e-05, "loss": 0.3908, "mean_token_accuracy": 0.8959005642682314, "num_tokens": 506772086.0, "step": 1050 }, { "entropy": 0.4150390625, "epoch": 0.6544577938489055, "grad_norm": 0.005015488686619663, "learning_rate": 2.0303219756173804e-05, "loss": 0.3882, "mean_token_accuracy": 0.8967312499880791, "num_tokens": 507728032.0, "step": 1052 }, { "entropy": 0.406494140625, "epoch": 0.6557020101870212, "grad_norm": 0.005461205404690161, "learning_rate": 2.028446389496718e-05, "loss": 0.3876, "mean_token_accuracy": 0.8971593976020813, "num_tokens": 508696363.0, "step": 1054 }, { "entropy": 0.409423828125, "epoch": 0.656946226525137, "grad_norm": 0.0057406053852382235, "learning_rate": 2.026570803376055e-05, "loss": 0.3811, "mean_token_accuracy": 0.898093581199646, "num_tokens": 509644434.0, "step": 1056 }, { "entropy": 0.41015625, "epoch": 0.6581904428632529, "grad_norm": 0.0053602473684116565, "learning_rate": 2.0246952172553927e-05, "loss": 0.3875, "mean_token_accuracy": 0.8965869285166264, "num_tokens": 510611970.0, "step": 1058 }, { "entropy": 0.416748046875, "epoch": 0.6594346592013687, "grad_norm": 0.00481563706997308, "learning_rate": 2.0228196311347295e-05, "loss": 0.3942, "mean_token_accuracy": 0.8958507999777794, "num_tokens": 511589880.0, "step": 1060 }, { "entropy": 0.4180908203125, "epoch": 0.6606788755394845, "grad_norm": 0.0059280290985392545, "learning_rate": 2.020944045014067e-05, "loss": 0.3919, "mean_token_accuracy": 0.8960464298725128, "num_tokens": 512565989.0, "step": 1062 }, { "entropy": 0.412841796875, "epoch": 0.6619230918776002, "grad_norm": 0.005769346600825443, "learning_rate": 2.0190684588934042e-05, "loss": 0.3834, "mean_token_accuracy": 0.897436136379838, "num_tokens": 513524389.0, "step": 1064 }, { "entropy": 0.40814208984375, "epoch": 0.663167308215716, "grad_norm": 0.006258067903600261, "learning_rate": 2.0171928727727417e-05, "loss": 0.3858, "mean_token_accuracy": 0.8971982151269913, "num_tokens": 514479153.0, "step": 1066 }, { "entropy": 0.41204833984375, "epoch": 0.6644115245538318, "grad_norm": 0.005090755918430515, "learning_rate": 2.0153172866520786e-05, "loss": 0.3919, "mean_token_accuracy": 0.8961809463799, "num_tokens": 515449891.0, "step": 1068 }, { "entropy": 0.41912841796875, "epoch": 0.6656557408919476, "grad_norm": 0.005965384616597388, "learning_rate": 2.013441700531416e-05, "loss": 0.39, "mean_token_accuracy": 0.8962800856679678, "num_tokens": 516407451.0, "step": 1070 }, { "entropy": 0.40545654296875, "epoch": 0.6668999572300633, "grad_norm": 0.0063683988513924756, "learning_rate": 2.0115661144107536e-05, "loss": 0.383, "mean_token_accuracy": 0.8979792650789022, "num_tokens": 517369583.0, "step": 1072 }, { "entropy": 0.40679931640625, "epoch": 0.6681441735681791, "grad_norm": 0.0053553616494144765, "learning_rate": 2.0096905282900908e-05, "loss": 0.3863, "mean_token_accuracy": 0.8972724936902523, "num_tokens": 518343701.0, "step": 1074 }, { "entropy": 0.41448974609375, "epoch": 0.669388389906295, "grad_norm": 0.005454613685491045, "learning_rate": 2.007814942169428e-05, "loss": 0.3868, "mean_token_accuracy": 0.8969937581568956, "num_tokens": 519314830.0, "step": 1076 }, { "entropy": 0.41229248046875, "epoch": 0.6706326062444108, "grad_norm": 0.005081160328860916, "learning_rate": 2.005939356048765e-05, "loss": 0.3844, "mean_token_accuracy": 0.8973781671375036, "num_tokens": 520279330.0, "step": 1078 }, { "entropy": 0.411376953125, "epoch": 0.6718768225825266, "grad_norm": 0.005868257465954111, "learning_rate": 2.0040637699281027e-05, "loss": 0.3892, "mean_token_accuracy": 0.896363602951169, "num_tokens": 521260911.0, "step": 1080 }, { "entropy": 0.42291259765625, "epoch": 0.6731210389206423, "grad_norm": 0.005956660240034949, "learning_rate": 2.00218818380744e-05, "loss": 0.3924, "mean_token_accuracy": 0.8959739655256271, "num_tokens": 522232071.0, "step": 1082 }, { "entropy": 0.4090576171875, "epoch": 0.6743652552587581, "grad_norm": 0.005014926488511839, "learning_rate": 2.0003125976867774e-05, "loss": 0.3824, "mean_token_accuracy": 0.8980595096945763, "num_tokens": 523193129.0, "step": 1084 }, { "entropy": 0.4073486328125, "epoch": 0.6756094715968739, "grad_norm": 0.005680186847733003, "learning_rate": 1.9984370115661142e-05, "loss": 0.3889, "mean_token_accuracy": 0.8971110507845879, "num_tokens": 524166459.0, "step": 1086 }, { "entropy": 0.413330078125, "epoch": 0.6768536879349897, "grad_norm": 0.006159391102145929, "learning_rate": 1.9965614254454517e-05, "loss": 0.3883, "mean_token_accuracy": 0.8964810874313116, "num_tokens": 525123344.0, "step": 1088 }, { "entropy": 0.40203857421875, "epoch": 0.6780979042731055, "grad_norm": 0.007155015158371388, "learning_rate": 1.9946858393247892e-05, "loss": 0.3793, "mean_token_accuracy": 0.8986930977553129, "num_tokens": 526092213.0, "step": 1090 }, { "entropy": 0.4195556640625, "epoch": 0.6793421206112212, "grad_norm": 0.005995833736856974, "learning_rate": 1.9928102532041264e-05, "loss": 0.3925, "mean_token_accuracy": 0.8958219159394503, "num_tokens": 527054439.0, "step": 1092 }, { "entropy": 0.40936279296875, "epoch": 0.680586336949337, "grad_norm": 0.005523127733734252, "learning_rate": 1.9909346670834636e-05, "loss": 0.3832, "mean_token_accuracy": 0.8977016974240541, "num_tokens": 528010580.0, "step": 1094 }, { "entropy": 0.39971923828125, "epoch": 0.6818305532874529, "grad_norm": 0.005506022023921297, "learning_rate": 1.9890590809628008e-05, "loss": 0.3779, "mean_token_accuracy": 0.8987138736993074, "num_tokens": 528971724.0, "step": 1096 }, { "entropy": 0.41552734375, "epoch": 0.6830747696255687, "grad_norm": 0.006616724409778324, "learning_rate": 1.9871834948421383e-05, "loss": 0.3893, "mean_token_accuracy": 0.8962969742715359, "num_tokens": 529928251.0, "step": 1098 }, { "entropy": 0.40216064453125, "epoch": 0.6843189859636845, "grad_norm": 0.006904848190639688, "learning_rate": 1.9853079087214755e-05, "loss": 0.3755, "mean_token_accuracy": 0.8991440329700708, "num_tokens": 530888673.0, "step": 1100 }, { "entropy": 0.4068603515625, "epoch": 0.6855632023018002, "grad_norm": 0.005124197409425911, "learning_rate": 1.9834323226008127e-05, "loss": 0.3857, "mean_token_accuracy": 0.8967606462538242, "num_tokens": 531852720.0, "step": 1102 }, { "entropy": 0.41937255859375, "epoch": 0.686807418639916, "grad_norm": 0.006820514918052845, "learning_rate": 1.9815567364801502e-05, "loss": 0.3915, "mean_token_accuracy": 0.8960362896323204, "num_tokens": 532818017.0, "step": 1104 }, { "entropy": 0.410400390625, "epoch": 0.6880516349780318, "grad_norm": 0.005965889925857235, "learning_rate": 1.9796811503594874e-05, "loss": 0.3879, "mean_token_accuracy": 0.8966072387993336, "num_tokens": 533788353.0, "step": 1106 }, { "entropy": 0.4158935546875, "epoch": 0.6892958513161476, "grad_norm": 0.005826731430176432, "learning_rate": 1.977805564238825e-05, "loss": 0.3932, "mean_token_accuracy": 0.8958346974104643, "num_tokens": 534747189.0, "step": 1108 }, { "entropy": 0.417236328125, "epoch": 0.6905400676542633, "grad_norm": 0.005408871220802163, "learning_rate": 1.975929978118162e-05, "loss": 0.3938, "mean_token_accuracy": 0.8960236236453056, "num_tokens": 535715515.0, "step": 1110 }, { "entropy": 0.422119140625, "epoch": 0.6917842839923791, "grad_norm": 0.005466616383346732, "learning_rate": 1.9740543919974992e-05, "loss": 0.394, "mean_token_accuracy": 0.8954806588590145, "num_tokens": 536685939.0, "step": 1112 }, { "entropy": 0.40869140625, "epoch": 0.693028500330495, "grad_norm": 0.0052664885018983655, "learning_rate": 1.9721788058768364e-05, "loss": 0.3823, "mean_token_accuracy": 0.8979523498564959, "num_tokens": 537644622.0, "step": 1114 }, { "entropy": 0.40283203125, "epoch": 0.6942727166686108, "grad_norm": 0.005325622143951246, "learning_rate": 1.970303219756174e-05, "loss": 0.3836, "mean_token_accuracy": 0.8973144106566906, "num_tokens": 538626877.0, "step": 1116 }, { "entropy": 0.4119873046875, "epoch": 0.6955169330067266, "grad_norm": 0.00525250333147363, "learning_rate": 1.968427633635511e-05, "loss": 0.3866, "mean_token_accuracy": 0.897228492423892, "num_tokens": 539585760.0, "step": 1118 }, { "entropy": 0.41180419921875, "epoch": 0.6967611493448423, "grad_norm": 0.004732373438815831, "learning_rate": 1.9665520475148483e-05, "loss": 0.3843, "mean_token_accuracy": 0.8972988482564688, "num_tokens": 540555300.0, "step": 1120 }, { "entropy": 0.41387939453125, "epoch": 0.6980053656829581, "grad_norm": 0.005056241895966053, "learning_rate": 1.9646764613941858e-05, "loss": 0.3916, "mean_token_accuracy": 0.8961893692612648, "num_tokens": 541538715.0, "step": 1122 }, { "entropy": 0.4130859375, "epoch": 0.6992495820210739, "grad_norm": 0.004980528197962088, "learning_rate": 1.962800875273523e-05, "loss": 0.3898, "mean_token_accuracy": 0.896166292950511, "num_tokens": 542500513.0, "step": 1124 }, { "entropy": 0.4129638671875, "epoch": 0.7004937983591897, "grad_norm": 0.00567010460561657, "learning_rate": 1.9609252891528605e-05, "loss": 0.3867, "mean_token_accuracy": 0.896731536835432, "num_tokens": 543468081.0, "step": 1126 }, { "entropy": 0.4075927734375, "epoch": 0.7017380146973055, "grad_norm": 0.0057942043042454785, "learning_rate": 1.9590497030321977e-05, "loss": 0.3871, "mean_token_accuracy": 0.8968832138925791, "num_tokens": 544441319.0, "step": 1128 }, { "entropy": 0.4154052734375, "epoch": 0.7029822310354212, "grad_norm": 0.005075589240747897, "learning_rate": 1.957174116911535e-05, "loss": 0.3899, "mean_token_accuracy": 0.8962947819381952, "num_tokens": 545401360.0, "step": 1130 }, { "entropy": 0.40667724609375, "epoch": 0.7042264473735371, "grad_norm": 0.005453267949309768, "learning_rate": 1.955298530790872e-05, "loss": 0.3829, "mean_token_accuracy": 0.8977912161499262, "num_tokens": 546349822.0, "step": 1132 }, { "entropy": 0.40765380859375, "epoch": 0.7054706637116529, "grad_norm": 0.004858191830245093, "learning_rate": 1.9534229446702096e-05, "loss": 0.3831, "mean_token_accuracy": 0.897522484883666, "num_tokens": 547303409.0, "step": 1134 }, { "entropy": 0.4019775390625, "epoch": 0.7067148800497687, "grad_norm": 0.006263781907127375, "learning_rate": 1.9515473585495468e-05, "loss": 0.3793, "mean_token_accuracy": 0.8982968758791685, "num_tokens": 548270245.0, "step": 1136 }, { "entropy": 0.41595458984375, "epoch": 0.7079590963878845, "grad_norm": 0.005225892107250052, "learning_rate": 1.949671772428884e-05, "loss": 0.3917, "mean_token_accuracy": 0.8959584943950176, "num_tokens": 549257289.0, "step": 1138 }, { "entropy": 0.4150390625, "epoch": 0.7092033127260002, "grad_norm": 0.006676720335572354, "learning_rate": 1.9477961863082215e-05, "loss": 0.3858, "mean_token_accuracy": 0.8973506949841976, "num_tokens": 550233834.0, "step": 1140 }, { "entropy": 0.42022705078125, "epoch": 0.710447529064116, "grad_norm": 0.00641443854903958, "learning_rate": 1.9459206001875586e-05, "loss": 0.3976, "mean_token_accuracy": 0.8950800392776728, "num_tokens": 551213792.0, "step": 1142 }, { "entropy": 0.40625, "epoch": 0.7116917454022318, "grad_norm": 0.005355077055787162, "learning_rate": 1.944045014066896e-05, "loss": 0.3828, "mean_token_accuracy": 0.8978765271604061, "num_tokens": 552170743.0, "step": 1144 }, { "entropy": 0.40948486328125, "epoch": 0.7129359617403476, "grad_norm": 0.005743001679617995, "learning_rate": 1.942169427946233e-05, "loss": 0.3849, "mean_token_accuracy": 0.8970100916922092, "num_tokens": 553137363.0, "step": 1146 }, { "entropy": 0.40570068359375, "epoch": 0.7141801780784633, "grad_norm": 0.006270416987513748, "learning_rate": 1.9402938418255705e-05, "loss": 0.3826, "mean_token_accuracy": 0.8975347969681025, "num_tokens": 554102075.0, "step": 1148 }, { "entropy": 0.405517578125, "epoch": 0.7154243944165792, "grad_norm": 0.0053510711977867624, "learning_rate": 1.9384182557049077e-05, "loss": 0.3843, "mean_token_accuracy": 0.8972335699945688, "num_tokens": 555065853.0, "step": 1150 }, { "entropy": 0.41650390625, "epoch": 0.716668610754695, "grad_norm": 0.006301191589535862, "learning_rate": 1.9365426695842452e-05, "loss": 0.3869, "mean_token_accuracy": 0.8971068132668734, "num_tokens": 556045814.0, "step": 1152 }, { "entropy": 0.40087890625, "epoch": 0.7179128270928108, "grad_norm": 0.005435580859777373, "learning_rate": 1.9346670834635824e-05, "loss": 0.3788, "mean_token_accuracy": 0.8988061007112265, "num_tokens": 557002514.0, "step": 1154 }, { "entropy": 0.40191650390625, "epoch": 0.7191570434309266, "grad_norm": 0.005834018749708554, "learning_rate": 1.9327914973429196e-05, "loss": 0.3806, "mean_token_accuracy": 0.8983368631452322, "num_tokens": 557968317.0, "step": 1156 }, { "entropy": 0.4019775390625, "epoch": 0.7204012597690423, "grad_norm": 0.005627330163771361, "learning_rate": 1.930915911222257e-05, "loss": 0.3779, "mean_token_accuracy": 0.8987822849303484, "num_tokens": 558925156.0, "step": 1158 }, { "entropy": 0.41021728515625, "epoch": 0.7216454761071581, "grad_norm": 0.005332177036666967, "learning_rate": 1.9290403251015943e-05, "loss": 0.3846, "mean_token_accuracy": 0.897439006716013, "num_tokens": 559899120.0, "step": 1160 }, { "entropy": 0.40692138671875, "epoch": 0.7228896924452739, "grad_norm": 0.0053086326299097465, "learning_rate": 1.9271647389809318e-05, "loss": 0.3888, "mean_token_accuracy": 0.896560313180089, "num_tokens": 560872142.0, "step": 1162 }, { "entropy": 0.41412353515625, "epoch": 0.7241339087833897, "grad_norm": 0.005960387145219613, "learning_rate": 1.9252891528602687e-05, "loss": 0.3885, "mean_token_accuracy": 0.8966344501823187, "num_tokens": 561845634.0, "step": 1164 }, { "entropy": 0.4144287109375, "epoch": 0.7253781251215055, "grad_norm": 0.005636520490123857, "learning_rate": 1.9234135667396062e-05, "loss": 0.3907, "mean_token_accuracy": 0.8959302101284266, "num_tokens": 562812134.0, "step": 1166 }, { "entropy": 0.40423583984375, "epoch": 0.7266223414596213, "grad_norm": 0.005132670877256198, "learning_rate": 1.9215379806189434e-05, "loss": 0.3804, "mean_token_accuracy": 0.8982013892382383, "num_tokens": 563789953.0, "step": 1168 }, { "entropy": 0.40692138671875, "epoch": 0.7278665577977371, "grad_norm": 0.005979391533241717, "learning_rate": 1.919662394498281e-05, "loss": 0.3858, "mean_token_accuracy": 0.8966696038842201, "num_tokens": 564750976.0, "step": 1170 }, { "entropy": 0.4014892578125, "epoch": 0.7291107741358529, "grad_norm": 0.004889239834200079, "learning_rate": 1.917786808377618e-05, "loss": 0.3755, "mean_token_accuracy": 0.8995068166404963, "num_tokens": 565713558.0, "step": 1172 }, { "entropy": 0.40948486328125, "epoch": 0.7303549904739687, "grad_norm": 0.00488035693158937, "learning_rate": 1.9159112222569552e-05, "loss": 0.3862, "mean_token_accuracy": 0.8970502987504005, "num_tokens": 566666659.0, "step": 1174 }, { "entropy": 0.41448974609375, "epoch": 0.7315992068120845, "grad_norm": 0.005461312837149896, "learning_rate": 1.9140356361362928e-05, "loss": 0.3889, "mean_token_accuracy": 0.896641168743372, "num_tokens": 567652384.0, "step": 1176 }, { "entropy": 0.4100341796875, "epoch": 0.7328434231502002, "grad_norm": 0.006049871263846083, "learning_rate": 1.91216005001563e-05, "loss": 0.3867, "mean_token_accuracy": 0.8966421987861395, "num_tokens": 568619114.0, "step": 1178 }, { "entropy": 0.40435791015625, "epoch": 0.734087639488316, "grad_norm": 0.005271492095077976, "learning_rate": 1.9102844638949675e-05, "loss": 0.3797, "mean_token_accuracy": 0.8978498950600624, "num_tokens": 569587075.0, "step": 1180 }, { "entropy": 0.40252685546875, "epoch": 0.7353318558264318, "grad_norm": 0.00485046374710851, "learning_rate": 1.9084088777743043e-05, "loss": 0.377, "mean_token_accuracy": 0.898680591955781, "num_tokens": 570559190.0, "step": 1182 }, { "entropy": 0.40234375, "epoch": 0.7365760721645476, "grad_norm": 0.005914985123002147, "learning_rate": 1.9065332916536418e-05, "loss": 0.3784, "mean_token_accuracy": 0.8985530808568001, "num_tokens": 571526776.0, "step": 1184 }, { "entropy": 0.4163818359375, "epoch": 0.7378202885026633, "grad_norm": 0.006268742978386777, "learning_rate": 1.904657705532979e-05, "loss": 0.3933, "mean_token_accuracy": 0.8954425398260355, "num_tokens": 572486522.0, "step": 1186 }, { "entropy": 0.41033935546875, "epoch": 0.7390645048407792, "grad_norm": 0.004922426700454727, "learning_rate": 1.9027821194123165e-05, "loss": 0.388, "mean_token_accuracy": 0.8970408570021391, "num_tokens": 573446220.0, "step": 1188 }, { "entropy": 0.4073486328125, "epoch": 0.740308721178895, "grad_norm": 0.005120991799350429, "learning_rate": 1.9009065332916537e-05, "loss": 0.3812, "mean_token_accuracy": 0.8980522584170103, "num_tokens": 574398806.0, "step": 1190 }, { "entropy": 0.4112548828125, "epoch": 0.7415529375170108, "grad_norm": 0.00522148002147237, "learning_rate": 1.899030947170991e-05, "loss": 0.3885, "mean_token_accuracy": 0.8969650529325008, "num_tokens": 575362334.0, "step": 1192 }, { "entropy": 0.4031982421875, "epoch": 0.7427971538551266, "grad_norm": 0.005303196994868923, "learning_rate": 1.8971553610503284e-05, "loss": 0.3787, "mean_token_accuracy": 0.8984784092754126, "num_tokens": 576323654.0, "step": 1194 }, { "entropy": 0.4010009765625, "epoch": 0.7440413701932423, "grad_norm": 0.0052516994072822586, "learning_rate": 1.8952797749296656e-05, "loss": 0.3791, "mean_token_accuracy": 0.8985953517258167, "num_tokens": 577296647.0, "step": 1196 }, { "entropy": 0.40948486328125, "epoch": 0.7452855865313581, "grad_norm": 0.0050395263383580125, "learning_rate": 1.893404188809003e-05, "loss": 0.3845, "mean_token_accuracy": 0.8973930757492781, "num_tokens": 578253377.0, "step": 1198 }, { "entropy": 0.41168212890625, "epoch": 0.7465298028694739, "grad_norm": 0.005236363994941287, "learning_rate": 1.89152860268834e-05, "loss": 0.386, "mean_token_accuracy": 0.8968632351607084, "num_tokens": 579224127.0, "step": 1200 }, { "epoch": 0.7465298028694739, "eval_entropy": 0.4046113367799114, "eval_loss": 0.3833683431148529, "eval_mean_token_accuracy": 0.8974666306680203, "eval_num_tokens": 579224127.0, "eval_runtime": 425.7989, "eval_samples_per_second": 203.453, "eval_steps_per_second": 3.18, "step": 1200 }, { "entropy": 0.4056396484375, "epoch": 0.7477740192075897, "grad_norm": 0.005307090286866229, "learning_rate": 1.8896530165676775e-05, "loss": 0.384, "mean_token_accuracy": 0.8975871633738279, "num_tokens": 580172960.0, "step": 1202 }, { "entropy": 0.418701171875, "epoch": 0.7490182355457055, "grad_norm": 0.0049273604053481105, "learning_rate": 1.8877774304470146e-05, "loss": 0.3927, "mean_token_accuracy": 0.8958396892994642, "num_tokens": 581159478.0, "step": 1204 }, { "entropy": 0.40875244140625, "epoch": 0.7502624518838213, "grad_norm": 0.005398218138623982, "learning_rate": 1.885901844326352e-05, "loss": 0.3801, "mean_token_accuracy": 0.8978426903486252, "num_tokens": 582128003.0, "step": 1206 }, { "entropy": 0.3992919921875, "epoch": 0.7515066682219371, "grad_norm": 0.00495924259892581, "learning_rate": 1.8840262582056893e-05, "loss": 0.3787, "mean_token_accuracy": 0.898076806217432, "num_tokens": 583091222.0, "step": 1208 }, { "entropy": 0.40911865234375, "epoch": 0.7527508845600529, "grad_norm": 0.005004401770734297, "learning_rate": 1.8821506720850265e-05, "loss": 0.3872, "mean_token_accuracy": 0.8967468123883009, "num_tokens": 584061399.0, "step": 1210 }, { "entropy": 0.40625, "epoch": 0.7539951008981687, "grad_norm": 0.005036754106094893, "learning_rate": 1.880275085964364e-05, "loss": 0.3814, "mean_token_accuracy": 0.8981030844151974, "num_tokens": 585039948.0, "step": 1212 }, { "entropy": 0.4200439453125, "epoch": 0.7552393172362845, "grad_norm": 0.0048875178819882585, "learning_rate": 1.8783994998437012e-05, "loss": 0.3926, "mean_token_accuracy": 0.8957268316298723, "num_tokens": 586011087.0, "step": 1214 }, { "entropy": 0.40509033203125, "epoch": 0.7564835335744002, "grad_norm": 0.004957217103053479, "learning_rate": 1.8765239137230384e-05, "loss": 0.3804, "mean_token_accuracy": 0.8981908746063709, "num_tokens": 586971654.0, "step": 1216 }, { "entropy": 0.414306640625, "epoch": 0.757727749912516, "grad_norm": 0.005655237170363597, "learning_rate": 1.8746483276023756e-05, "loss": 0.3926, "mean_token_accuracy": 0.8960168790072203, "num_tokens": 587955355.0, "step": 1218 }, { "entropy": 0.40887451171875, "epoch": 0.7589719662506318, "grad_norm": 0.004589757817243662, "learning_rate": 1.872772741481713e-05, "loss": 0.3839, "mean_token_accuracy": 0.8976372517645359, "num_tokens": 588926927.0, "step": 1220 }, { "entropy": 0.40643310546875, "epoch": 0.7602161825887476, "grad_norm": 0.005168351111874489, "learning_rate": 1.8708971553610503e-05, "loss": 0.3828, "mean_token_accuracy": 0.8971650060266256, "num_tokens": 589904425.0, "step": 1222 }, { "entropy": 0.4140625, "epoch": 0.7614603989268635, "grad_norm": 0.005149512441676835, "learning_rate": 1.8690215692403878e-05, "loss": 0.3907, "mean_token_accuracy": 0.8963935133069754, "num_tokens": 590872447.0, "step": 1224 }, { "entropy": 0.406982421875, "epoch": 0.7627046152649792, "grad_norm": 0.004681147535072703, "learning_rate": 1.867145983119725e-05, "loss": 0.3817, "mean_token_accuracy": 0.8980054464191198, "num_tokens": 591843792.0, "step": 1226 }, { "entropy": 0.4072265625, "epoch": 0.763948831603095, "grad_norm": 0.005062718378793491, "learning_rate": 1.865270396999062e-05, "loss": 0.3815, "mean_token_accuracy": 0.8980558589100838, "num_tokens": 592818496.0, "step": 1228 }, { "entropy": 0.39593505859375, "epoch": 0.7651930479412108, "grad_norm": 0.0058545868812565184, "learning_rate": 1.8633948108783997e-05, "loss": 0.3751, "mean_token_accuracy": 0.899000383913517, "num_tokens": 593794922.0, "step": 1230 }, { "entropy": 0.422119140625, "epoch": 0.7664372642793266, "grad_norm": 0.005009216384405115, "learning_rate": 1.861519224757737e-05, "loss": 0.3963, "mean_token_accuracy": 0.8950500506907701, "num_tokens": 594768128.0, "step": 1232 }, { "entropy": 0.4107666015625, "epoch": 0.7676814806174423, "grad_norm": 0.005626237419338881, "learning_rate": 1.859643638637074e-05, "loss": 0.3869, "mean_token_accuracy": 0.8969231639057398, "num_tokens": 595726343.0, "step": 1234 }, { "entropy": 0.40618896484375, "epoch": 0.7689256969555581, "grad_norm": 0.004963600631725088, "learning_rate": 1.8577680525164112e-05, "loss": 0.38, "mean_token_accuracy": 0.8984140045940876, "num_tokens": 596686861.0, "step": 1236 }, { "entropy": 0.40484619140625, "epoch": 0.7701699132936739, "grad_norm": 0.0059391560952274805, "learning_rate": 1.8558924663957487e-05, "loss": 0.3818, "mean_token_accuracy": 0.8976399768143892, "num_tokens": 597659680.0, "step": 1238 }, { "entropy": 0.4154052734375, "epoch": 0.7714141296317897, "grad_norm": 0.005605888260680417, "learning_rate": 1.854016880275086e-05, "loss": 0.3916, "mean_token_accuracy": 0.8962047453969717, "num_tokens": 598616443.0, "step": 1240 }, { "entropy": 0.41729736328125, "epoch": 0.7726583459699056, "grad_norm": 0.00560409967465218, "learning_rate": 1.8521412941544234e-05, "loss": 0.3904, "mean_token_accuracy": 0.8959630280733109, "num_tokens": 599591374.0, "step": 1242 }, { "entropy": 0.41162109375, "epoch": 0.7739025623080213, "grad_norm": 0.005598396017706495, "learning_rate": 1.8502657080337606e-05, "loss": 0.3886, "mean_token_accuracy": 0.8962363302707672, "num_tokens": 600567411.0, "step": 1244 }, { "entropy": 0.40411376953125, "epoch": 0.7751467786461371, "grad_norm": 0.005384433903265137, "learning_rate": 1.8483901219130978e-05, "loss": 0.3841, "mean_token_accuracy": 0.8973080758005381, "num_tokens": 601536038.0, "step": 1246 }, { "entropy": 0.40887451171875, "epoch": 0.7763909949842529, "grad_norm": 0.006462946439902965, "learning_rate": 1.8465145357924353e-05, "loss": 0.3772, "mean_token_accuracy": 0.8988860230892897, "num_tokens": 602504926.0, "step": 1248 }, { "entropy": 0.40618896484375, "epoch": 0.7776352113223687, "grad_norm": 0.005693458305718539, "learning_rate": 1.8446389496717725e-05, "loss": 0.3837, "mean_token_accuracy": 0.8977437373250723, "num_tokens": 603464381.0, "step": 1250 }, { "entropy": 0.40008544921875, "epoch": 0.7788794276604845, "grad_norm": 0.00509362418421761, "learning_rate": 1.8427633635511097e-05, "loss": 0.3792, "mean_token_accuracy": 0.8984755855053663, "num_tokens": 604431012.0, "step": 1252 }, { "entropy": 0.40313720703125, "epoch": 0.7801236439986002, "grad_norm": 0.00543520623598083, "learning_rate": 1.840887777430447e-05, "loss": 0.3779, "mean_token_accuracy": 0.8986425511538982, "num_tokens": 605398358.0, "step": 1254 }, { "entropy": 0.39874267578125, "epoch": 0.781367860336716, "grad_norm": 0.005417691697249489, "learning_rate": 1.8390121913097844e-05, "loss": 0.378, "mean_token_accuracy": 0.8989900816231966, "num_tokens": 606351531.0, "step": 1256 }, { "entropy": 0.4014892578125, "epoch": 0.7826120766748318, "grad_norm": 0.0053906125515040685, "learning_rate": 1.837136605189122e-05, "loss": 0.3823, "mean_token_accuracy": 0.8980196267366409, "num_tokens": 607326899.0, "step": 1258 }, { "entropy": 0.4139404296875, "epoch": 0.7838562930129477, "grad_norm": 0.005137640296419085, "learning_rate": 1.8352610190684587e-05, "loss": 0.3876, "mean_token_accuracy": 0.8962286598980427, "num_tokens": 608301166.0, "step": 1260 }, { "entropy": 0.40374755859375, "epoch": 0.7851005093510635, "grad_norm": 0.00480737689185828, "learning_rate": 1.8333854329477963e-05, "loss": 0.3784, "mean_token_accuracy": 0.898613378405571, "num_tokens": 609273537.0, "step": 1262 }, { "entropy": 0.39825439453125, "epoch": 0.7863447256891792, "grad_norm": 0.005544028947518909, "learning_rate": 1.8315098468271334e-05, "loss": 0.3796, "mean_token_accuracy": 0.8983443062752485, "num_tokens": 610230475.0, "step": 1264 }, { "entropy": 0.40435791015625, "epoch": 0.787588942027295, "grad_norm": 0.0053641823175241575, "learning_rate": 1.829634260706471e-05, "loss": 0.3789, "mean_token_accuracy": 0.8984312936663628, "num_tokens": 611193110.0, "step": 1266 }, { "entropy": 0.40948486328125, "epoch": 0.7888331583654108, "grad_norm": 0.005506651029912604, "learning_rate": 1.827758674585808e-05, "loss": 0.3861, "mean_token_accuracy": 0.897303219884634, "num_tokens": 612155581.0, "step": 1268 }, { "entropy": 0.40380859375, "epoch": 0.7900773747035266, "grad_norm": 0.005467066960413349, "learning_rate": 1.8258830884651453e-05, "loss": 0.3781, "mean_token_accuracy": 0.8987308535724878, "num_tokens": 613124402.0, "step": 1270 }, { "entropy": 0.39990234375, "epoch": 0.7913215910416423, "grad_norm": 0.005183266153890559, "learning_rate": 1.8240075023444825e-05, "loss": 0.3778, "mean_token_accuracy": 0.8985103871673346, "num_tokens": 614090889.0, "step": 1272 }, { "entropy": 0.41131591796875, "epoch": 0.7925658073797581, "grad_norm": 0.0053897188005217886, "learning_rate": 1.82213191622382e-05, "loss": 0.3866, "mean_token_accuracy": 0.8970832955092192, "num_tokens": 615067642.0, "step": 1274 }, { "entropy": 0.40521240234375, "epoch": 0.7938100237178739, "grad_norm": 0.005339746623103771, "learning_rate": 1.8202563301031575e-05, "loss": 0.3817, "mean_token_accuracy": 0.898108372464776, "num_tokens": 616035813.0, "step": 1276 }, { "entropy": 0.3997802734375, "epoch": 0.7950542400559897, "grad_norm": 0.005430828646405775, "learning_rate": 1.8183807439824944e-05, "loss": 0.3787, "mean_token_accuracy": 0.8980689588934183, "num_tokens": 616991322.0, "step": 1278 }, { "entropy": 0.4036865234375, "epoch": 0.7962984563941056, "grad_norm": 0.00543583145164992, "learning_rate": 1.816505157861832e-05, "loss": 0.3783, "mean_token_accuracy": 0.898298978805542, "num_tokens": 617951889.0, "step": 1280 }, { "entropy": 0.39666748046875, "epoch": 0.7975426727322213, "grad_norm": 0.004982959692985278, "learning_rate": 1.814629571741169e-05, "loss": 0.3724, "mean_token_accuracy": 0.8996069561690092, "num_tokens": 618909080.0, "step": 1282 }, { "entropy": 0.39471435546875, "epoch": 0.7987868890703371, "grad_norm": 0.004813411086252739, "learning_rate": 1.8127539856205066e-05, "loss": 0.3771, "mean_token_accuracy": 0.8987947925925255, "num_tokens": 619863977.0, "step": 1284 }, { "entropy": 0.40203857421875, "epoch": 0.8000311054084529, "grad_norm": 0.005227747425898589, "learning_rate": 1.8108783994998438e-05, "loss": 0.377, "mean_token_accuracy": 0.8987550139427185, "num_tokens": 620821169.0, "step": 1286 }, { "entropy": 0.4058837890625, "epoch": 0.8012753217465687, "grad_norm": 0.005163496373068368, "learning_rate": 1.809002813379181e-05, "loss": 0.3855, "mean_token_accuracy": 0.8970469329506159, "num_tokens": 621795953.0, "step": 1288 }, { "entropy": 0.41485595703125, "epoch": 0.8025195380846845, "grad_norm": 0.0056228444792083615, "learning_rate": 1.807127227258518e-05, "loss": 0.3881, "mean_token_accuracy": 0.8971890043467283, "num_tokens": 622776905.0, "step": 1290 }, { "entropy": 0.4097900390625, "epoch": 0.8037637544228002, "grad_norm": 0.0052220812264867295, "learning_rate": 1.8052516411378557e-05, "loss": 0.3834, "mean_token_accuracy": 0.8970935381948948, "num_tokens": 623754579.0, "step": 1292 }, { "entropy": 0.4097900390625, "epoch": 0.805007970760916, "grad_norm": 0.005199596827656552, "learning_rate": 1.8033760550171932e-05, "loss": 0.3842, "mean_token_accuracy": 0.8970695789903402, "num_tokens": 624715268.0, "step": 1294 }, { "entropy": 0.40008544921875, "epoch": 0.8062521870990318, "grad_norm": 0.005706853657584404, "learning_rate": 1.80150046889653e-05, "loss": 0.3771, "mean_token_accuracy": 0.8987214807420969, "num_tokens": 625676706.0, "step": 1296 }, { "entropy": 0.3919677734375, "epoch": 0.8074964034371477, "grad_norm": 0.006037892128107661, "learning_rate": 1.7996248827758675e-05, "loss": 0.3738, "mean_token_accuracy": 0.8996866662055254, "num_tokens": 626638067.0, "step": 1298 }, { "entropy": 0.40557861328125, "epoch": 0.8087406197752635, "grad_norm": 0.00638738112649562, "learning_rate": 1.7977492966552047e-05, "loss": 0.378, "mean_token_accuracy": 0.8984963018447161, "num_tokens": 627598193.0, "step": 1300 }, { "entropy": 0.40838623046875, "epoch": 0.8099848361133792, "grad_norm": 0.005564672420683832, "learning_rate": 1.7958737105345422e-05, "loss": 0.38, "mean_token_accuracy": 0.8978992812335491, "num_tokens": 628564835.0, "step": 1302 }, { "entropy": 0.3973388671875, "epoch": 0.811229052451495, "grad_norm": 0.006736133999060941, "learning_rate": 1.793998124413879e-05, "loss": 0.3833, "mean_token_accuracy": 0.8973161764442921, "num_tokens": 629509649.0, "step": 1304 }, { "entropy": 0.404296875, "epoch": 0.8124732687896108, "grad_norm": 0.006370995978315379, "learning_rate": 1.7921225382932166e-05, "loss": 0.3781, "mean_token_accuracy": 0.8984268959611654, "num_tokens": 630470311.0, "step": 1306 }, { "entropy": 0.4105224609375, "epoch": 0.8137174851277266, "grad_norm": 0.005397322645354552, "learning_rate": 1.790246952172554e-05, "loss": 0.3853, "mean_token_accuracy": 0.8970671091228724, "num_tokens": 631431608.0, "step": 1308 }, { "entropy": 0.3924560546875, "epoch": 0.8149617014658423, "grad_norm": 0.005792910296434646, "learning_rate": 1.7883713660518913e-05, "loss": 0.3753, "mean_token_accuracy": 0.8992858286947012, "num_tokens": 632414408.0, "step": 1310 }, { "entropy": 0.40179443359375, "epoch": 0.8162059178039581, "grad_norm": 0.005254130039747673, "learning_rate": 1.786495779931229e-05, "loss": 0.3774, "mean_token_accuracy": 0.898780507966876, "num_tokens": 633369789.0, "step": 1312 }, { "entropy": 0.40875244140625, "epoch": 0.8174501341420739, "grad_norm": 0.005769934223439549, "learning_rate": 1.7846201938105657e-05, "loss": 0.3837, "mean_token_accuracy": 0.8971659056842327, "num_tokens": 634336356.0, "step": 1314 }, { "entropy": 0.39569091796875, "epoch": 0.8186943504801898, "grad_norm": 0.00496005932094977, "learning_rate": 1.7827446076899032e-05, "loss": 0.3776, "mean_token_accuracy": 0.8989657796919346, "num_tokens": 635304819.0, "step": 1316 }, { "entropy": 0.413330078125, "epoch": 0.8199385668183056, "grad_norm": 0.00518527535008451, "learning_rate": 1.7808690215692404e-05, "loss": 0.3887, "mean_token_accuracy": 0.8965956885367632, "num_tokens": 636278877.0, "step": 1318 }, { "entropy": 0.39971923828125, "epoch": 0.8211827831564213, "grad_norm": 0.005415593222267816, "learning_rate": 1.778993435448578e-05, "loss": 0.3772, "mean_token_accuracy": 0.8987345285713673, "num_tokens": 637250631.0, "step": 1320 }, { "entropy": 0.39892578125, "epoch": 0.8224269994945371, "grad_norm": 0.005501964971071469, "learning_rate": 1.7771178493279147e-05, "loss": 0.3762, "mean_token_accuracy": 0.8989961203187704, "num_tokens": 638203667.0, "step": 1322 }, { "entropy": 0.40423583984375, "epoch": 0.8236712158326529, "grad_norm": 0.005425708004189737, "learning_rate": 1.7752422632072523e-05, "loss": 0.3804, "mean_token_accuracy": 0.8984730821102858, "num_tokens": 639184725.0, "step": 1324 }, { "entropy": 0.4088134765625, "epoch": 0.8249154321707687, "grad_norm": 0.00523495405174905, "learning_rate": 1.7733666770865898e-05, "loss": 0.3875, "mean_token_accuracy": 0.8968045245856047, "num_tokens": 640150768.0, "step": 1326 }, { "entropy": 0.3995361328125, "epoch": 0.8261596485088845, "grad_norm": 0.005314818661639284, "learning_rate": 1.771491090965927e-05, "loss": 0.3783, "mean_token_accuracy": 0.898647403344512, "num_tokens": 641117772.0, "step": 1328 }, { "entropy": 0.40411376953125, "epoch": 0.8274038648470002, "grad_norm": 0.004676594681433151, "learning_rate": 1.7696155048452645e-05, "loss": 0.3846, "mean_token_accuracy": 0.896965192630887, "num_tokens": 642080222.0, "step": 1330 }, { "entropy": 0.40960693359375, "epoch": 0.828648081185116, "grad_norm": 0.005263981597671043, "learning_rate": 1.7677399187246013e-05, "loss": 0.386, "mean_token_accuracy": 0.8967100847512484, "num_tokens": 643052608.0, "step": 1332 }, { "entropy": 0.405517578125, "epoch": 0.8298922975232319, "grad_norm": 0.0053284696764008, "learning_rate": 1.765864332603939e-05, "loss": 0.3799, "mean_token_accuracy": 0.8983614630997181, "num_tokens": 644029685.0, "step": 1334 }, { "entropy": 0.3995361328125, "epoch": 0.8311365138613477, "grad_norm": 0.005318934911070496, "learning_rate": 1.763988746483276e-05, "loss": 0.377, "mean_token_accuracy": 0.8980212677270174, "num_tokens": 644989319.0, "step": 1336 }, { "entropy": 0.40472412109375, "epoch": 0.8323807301994635, "grad_norm": 0.004795263961747096, "learning_rate": 1.7621131603626135e-05, "loss": 0.3815, "mean_token_accuracy": 0.8976876083761454, "num_tokens": 645951802.0, "step": 1338 }, { "entropy": 0.400634765625, "epoch": 0.8336249465375792, "grad_norm": 0.004947341203782201, "learning_rate": 1.7602375742419504e-05, "loss": 0.3792, "mean_token_accuracy": 0.8978105429559946, "num_tokens": 646907599.0, "step": 1340 }, { "entropy": 0.40618896484375, "epoch": 0.834869162875695, "grad_norm": 0.004937723702785143, "learning_rate": 1.758361988121288e-05, "loss": 0.3826, "mean_token_accuracy": 0.8976176492869854, "num_tokens": 647865747.0, "step": 1342 }, { "entropy": 0.4046630859375, "epoch": 0.8361133792138108, "grad_norm": 0.005243058066924142, "learning_rate": 1.7564864020006254e-05, "loss": 0.381, "mean_token_accuracy": 0.8976841121912003, "num_tokens": 648824128.0, "step": 1344 }, { "entropy": 0.3994140625, "epoch": 0.8373575955519266, "grad_norm": 0.005621130589914771, "learning_rate": 1.7546108158799626e-05, "loss": 0.3779, "mean_token_accuracy": 0.8985711988061666, "num_tokens": 649787628.0, "step": 1346 }, { "entropy": 0.40911865234375, "epoch": 0.8386018118900423, "grad_norm": 0.005494165291077512, "learning_rate": 1.7527352297592998e-05, "loss": 0.3842, "mean_token_accuracy": 0.8971004374325275, "num_tokens": 650746355.0, "step": 1348 }, { "entropy": 0.410400390625, "epoch": 0.8398460282281581, "grad_norm": 0.0053335996680841475, "learning_rate": 1.750859643638637e-05, "loss": 0.3862, "mean_token_accuracy": 0.8971538785845041, "num_tokens": 651707016.0, "step": 1350 }, { "entropy": 0.39599609375, "epoch": 0.841090244566274, "grad_norm": 0.004572608260159585, "learning_rate": 1.7489840575179745e-05, "loss": 0.375, "mean_token_accuracy": 0.8992204777896404, "num_tokens": 652664116.0, "step": 1352 }, { "entropy": 0.40191650390625, "epoch": 0.8423344609043898, "grad_norm": 0.005166718761332788, "learning_rate": 1.7471084713973117e-05, "loss": 0.3796, "mean_token_accuracy": 0.8981178998947144, "num_tokens": 653614778.0, "step": 1354 }, { "entropy": 0.39593505859375, "epoch": 0.8435786772425056, "grad_norm": 0.0053558593152423385, "learning_rate": 1.7452328852766492e-05, "loss": 0.3726, "mean_token_accuracy": 0.8996075745671988, "num_tokens": 654576897.0, "step": 1356 }, { "entropy": 0.40167236328125, "epoch": 0.8448228935806213, "grad_norm": 0.0049391797689049945, "learning_rate": 1.743357299155986e-05, "loss": 0.376, "mean_token_accuracy": 0.8989114835858345, "num_tokens": 655543360.0, "step": 1358 }, { "entropy": 0.4068603515625, "epoch": 0.8460671099187371, "grad_norm": 0.004830793979932011, "learning_rate": 1.7414817130353235e-05, "loss": 0.3826, "mean_token_accuracy": 0.8973547574132681, "num_tokens": 656510060.0, "step": 1360 }, { "entropy": 0.40350341796875, "epoch": 0.8473113262568529, "grad_norm": 0.0058789672202246325, "learning_rate": 1.739606126914661e-05, "loss": 0.3801, "mean_token_accuracy": 0.8980994876474142, "num_tokens": 657486570.0, "step": 1362 }, { "entropy": 0.39959716796875, "epoch": 0.8485555425949687, "grad_norm": 0.005519205335271455, "learning_rate": 1.7377305407939982e-05, "loss": 0.3756, "mean_token_accuracy": 0.8989692833274603, "num_tokens": 658448931.0, "step": 1364 }, { "entropy": 0.40325927734375, "epoch": 0.8497997589330845, "grad_norm": 0.005295136740598054, "learning_rate": 1.7358549546733354e-05, "loss": 0.3821, "mean_token_accuracy": 0.897557033225894, "num_tokens": 659410836.0, "step": 1366 }, { "entropy": 0.39971923828125, "epoch": 0.8510439752712002, "grad_norm": 0.005530572845198934, "learning_rate": 1.7339793685526726e-05, "loss": 0.3802, "mean_token_accuracy": 0.898717338219285, "num_tokens": 660367343.0, "step": 1368 }, { "entropy": 0.40167236328125, "epoch": 0.852288191609316, "grad_norm": 0.005510680704431796, "learning_rate": 1.73210378243201e-05, "loss": 0.3798, "mean_token_accuracy": 0.8981695491820574, "num_tokens": 661311072.0, "step": 1370 }, { "entropy": 0.4017333984375, "epoch": 0.8535324079474319, "grad_norm": 0.00627544834073864, "learning_rate": 1.7302281963113473e-05, "loss": 0.3802, "mean_token_accuracy": 0.8979060519486666, "num_tokens": 662276173.0, "step": 1372 }, { "entropy": 0.40576171875, "epoch": 0.8547766242855477, "grad_norm": 0.005284287895948304, "learning_rate": 1.7283526101906845e-05, "loss": 0.38, "mean_token_accuracy": 0.8976269289851189, "num_tokens": 663237220.0, "step": 1374 }, { "entropy": 0.40313720703125, "epoch": 0.8560208406236635, "grad_norm": 0.0048372352353352015, "learning_rate": 1.726477024070022e-05, "loss": 0.3804, "mean_token_accuracy": 0.8974504210054874, "num_tokens": 664192184.0, "step": 1376 }, { "entropy": 0.39801025390625, "epoch": 0.8572650569617792, "grad_norm": 0.006044097694267276, "learning_rate": 1.7246014379493592e-05, "loss": 0.3765, "mean_token_accuracy": 0.8990536779165268, "num_tokens": 665173450.0, "step": 1378 }, { "entropy": 0.3997802734375, "epoch": 0.858509273299895, "grad_norm": 0.004917726681654031, "learning_rate": 1.7227258518286967e-05, "loss": 0.3778, "mean_token_accuracy": 0.8984164148569107, "num_tokens": 666140328.0, "step": 1380 }, { "entropy": 0.4068603515625, "epoch": 0.8597534896380108, "grad_norm": 0.005164612522375464, "learning_rate": 1.720850265708034e-05, "loss": 0.3814, "mean_token_accuracy": 0.8977449219673872, "num_tokens": 667108799.0, "step": 1382 }, { "entropy": 0.400634765625, "epoch": 0.8609977059761266, "grad_norm": 0.0048767014595824075, "learning_rate": 1.718974679587371e-05, "loss": 0.3788, "mean_token_accuracy": 0.8983617555350065, "num_tokens": 668079783.0, "step": 1384 }, { "entropy": 0.40985107421875, "epoch": 0.8622419223142423, "grad_norm": 0.005677661286329443, "learning_rate": 1.7170990934667082e-05, "loss": 0.3825, "mean_token_accuracy": 0.8976128529757261, "num_tokens": 669046344.0, "step": 1386 }, { "entropy": 0.40191650390625, "epoch": 0.8634861386523581, "grad_norm": 0.00660113277471055, "learning_rate": 1.7152235073460458e-05, "loss": 0.384, "mean_token_accuracy": 0.8972909357398748, "num_tokens": 670005732.0, "step": 1388 }, { "entropy": 0.40130615234375, "epoch": 0.864730354990474, "grad_norm": 0.00567762450560956, "learning_rate": 1.713347921225383e-05, "loss": 0.3735, "mean_token_accuracy": 0.8992943931370974, "num_tokens": 670966052.0, "step": 1390 }, { "entropy": 0.397216796875, "epoch": 0.8659745713285898, "grad_norm": 0.005341832827144533, "learning_rate": 1.71147233510472e-05, "loss": 0.3706, "mean_token_accuracy": 0.9002178777009249, "num_tokens": 671933769.0, "step": 1392 }, { "entropy": 0.3953857421875, "epoch": 0.8672187876667056, "grad_norm": 0.005799913304371058, "learning_rate": 1.7095967489840576e-05, "loss": 0.3807, "mean_token_accuracy": 0.8987330719828606, "num_tokens": 672893381.0, "step": 1394 }, { "entropy": 0.39410400390625, "epoch": 0.8684630040048213, "grad_norm": 0.005048682472670329, "learning_rate": 1.7077211628633948e-05, "loss": 0.3699, "mean_token_accuracy": 0.8998918514698744, "num_tokens": 673840539.0, "step": 1396 }, { "entropy": 0.39471435546875, "epoch": 0.8697072203429371, "grad_norm": 0.0052664151445726915, "learning_rate": 1.7058455767427323e-05, "loss": 0.3741, "mean_token_accuracy": 0.8993562012910843, "num_tokens": 674799970.0, "step": 1398 }, { "entropy": 0.395263671875, "epoch": 0.8709514366810529, "grad_norm": 0.005470348597093235, "learning_rate": 1.7039699906220695e-05, "loss": 0.3733, "mean_token_accuracy": 0.8995196856558323, "num_tokens": 675773117.0, "step": 1400 }, { "epoch": 0.8709514366810529, "eval_entropy": 0.4023653872784343, "eval_loss": 0.3780038058757782, "eval_mean_token_accuracy": 0.8983533945446353, "eval_num_tokens": 675773117.0, "eval_runtime": 425.7923, "eval_samples_per_second": 203.456, "eval_steps_per_second": 3.18, "step": 1400 }, { "entropy": 0.4022216796875, "epoch": 0.8721956530191687, "grad_norm": 0.006131056342886626, "learning_rate": 1.7020944045014067e-05, "loss": 0.3771, "mean_token_accuracy": 0.898376589640975, "num_tokens": 676743631.0, "step": 1402 }, { "entropy": 0.40570068359375, "epoch": 0.8734398693572845, "grad_norm": 0.005376564191053507, "learning_rate": 1.700218818380744e-05, "loss": 0.3828, "mean_token_accuracy": 0.897642008960247, "num_tokens": 677717294.0, "step": 1404 }, { "entropy": 0.40240478515625, "epoch": 0.8746840856954002, "grad_norm": 0.005260727059119352, "learning_rate": 1.6983432322600814e-05, "loss": 0.377, "mean_token_accuracy": 0.8990323208272457, "num_tokens": 678687397.0, "step": 1406 }, { "entropy": 0.4033203125, "epoch": 0.8759283020335161, "grad_norm": 0.004954229475148383, "learning_rate": 1.6964676461394186e-05, "loss": 0.3773, "mean_token_accuracy": 0.898332042619586, "num_tokens": 679648723.0, "step": 1408 }, { "entropy": 0.39385986328125, "epoch": 0.8771725183716319, "grad_norm": 0.00493596913517282, "learning_rate": 1.6945920600187558e-05, "loss": 0.3723, "mean_token_accuracy": 0.8995107840746641, "num_tokens": 680625233.0, "step": 1410 }, { "entropy": 0.38739013671875, "epoch": 0.8784167347097477, "grad_norm": 0.0052195752627374565, "learning_rate": 1.6927164738980933e-05, "loss": 0.3664, "mean_token_accuracy": 0.9008036181330681, "num_tokens": 681603515.0, "step": 1412 }, { "entropy": 0.40216064453125, "epoch": 0.8796609510478635, "grad_norm": 0.005174503255183653, "learning_rate": 1.6908408877774305e-05, "loss": 0.3797, "mean_token_accuracy": 0.8981857150793076, "num_tokens": 682573416.0, "step": 1414 }, { "entropy": 0.40362548828125, "epoch": 0.8809051673859792, "grad_norm": 0.004817606353062603, "learning_rate": 1.688965301656768e-05, "loss": 0.3843, "mean_token_accuracy": 0.8969882000237703, "num_tokens": 683541098.0, "step": 1416 }, { "entropy": 0.4066162109375, "epoch": 0.882149383724095, "grad_norm": 0.0056264179113335875, "learning_rate": 1.6870897155361048e-05, "loss": 0.3822, "mean_token_accuracy": 0.8977867681533098, "num_tokens": 684514300.0, "step": 1418 }, { "entropy": 0.3963623046875, "epoch": 0.8833936000622108, "grad_norm": 0.005373478086264892, "learning_rate": 1.6852141294154423e-05, "loss": 0.3739, "mean_token_accuracy": 0.899159511551261, "num_tokens": 685472528.0, "step": 1420 }, { "entropy": 0.39874267578125, "epoch": 0.8846378164003266, "grad_norm": 0.004700846041505724, "learning_rate": 1.6833385432947795e-05, "loss": 0.3758, "mean_token_accuracy": 0.8987201433628798, "num_tokens": 686437156.0, "step": 1422 }, { "entropy": 0.39581298828125, "epoch": 0.8858820327384423, "grad_norm": 0.0056117551205033004, "learning_rate": 1.681462957174117e-05, "loss": 0.3739, "mean_token_accuracy": 0.8992190212011337, "num_tokens": 687407389.0, "step": 1424 }, { "entropy": 0.4010009765625, "epoch": 0.8871262490765582, "grad_norm": 0.005357886207794235, "learning_rate": 1.6795873710534542e-05, "loss": 0.3748, "mean_token_accuracy": 0.8986546844244003, "num_tokens": 688386860.0, "step": 1426 }, { "entropy": 0.39813232421875, "epoch": 0.888370465414674, "grad_norm": 0.005361200348556276, "learning_rate": 1.6777117849327914e-05, "loss": 0.3795, "mean_token_accuracy": 0.897765226662159, "num_tokens": 689364433.0, "step": 1428 }, { "entropy": 0.4002685546875, "epoch": 0.8896146817527898, "grad_norm": 0.004975345848441456, "learning_rate": 1.675836198812129e-05, "loss": 0.3791, "mean_token_accuracy": 0.8983795884996653, "num_tokens": 690322881.0, "step": 1430 }, { "entropy": 0.3980712890625, "epoch": 0.8908588980909056, "grad_norm": 0.0051256664346420785, "learning_rate": 1.673960612691466e-05, "loss": 0.3734, "mean_token_accuracy": 0.8987835813313723, "num_tokens": 691283763.0, "step": 1432 }, { "entropy": 0.3939208984375, "epoch": 0.8921031144290213, "grad_norm": 0.005274837135999603, "learning_rate": 1.6720850265708036e-05, "loss": 0.3761, "mean_token_accuracy": 0.8993116375058889, "num_tokens": 692252507.0, "step": 1434 }, { "entropy": 0.4117431640625, "epoch": 0.8933473307671371, "grad_norm": 0.0062749052196716185, "learning_rate": 1.6702094404501405e-05, "loss": 0.3844, "mean_token_accuracy": 0.8966869618743658, "num_tokens": 693215702.0, "step": 1436 }, { "entropy": 0.393798828125, "epoch": 0.8945915471052529, "grad_norm": 0.0061363598344107645, "learning_rate": 1.668333854329478e-05, "loss": 0.3714, "mean_token_accuracy": 0.8997723218053579, "num_tokens": 694187271.0, "step": 1438 }, { "entropy": 0.39849853515625, "epoch": 0.8958357634433687, "grad_norm": 0.005437553358336505, "learning_rate": 1.6664582682088152e-05, "loss": 0.3777, "mean_token_accuracy": 0.8986912872642279, "num_tokens": 695153653.0, "step": 1440 }, { "entropy": 0.39971923828125, "epoch": 0.8970799797814845, "grad_norm": 0.005215995281306245, "learning_rate": 1.6645826820881527e-05, "loss": 0.3759, "mean_token_accuracy": 0.8989935517311096, "num_tokens": 696116982.0, "step": 1442 }, { "entropy": 0.40045166015625, "epoch": 0.8983241961196003, "grad_norm": 0.005550440398373322, "learning_rate": 1.6627070959674902e-05, "loss": 0.3757, "mean_token_accuracy": 0.899070356041193, "num_tokens": 697089442.0, "step": 1444 }, { "entropy": 0.3916015625, "epoch": 0.8995684124577161, "grad_norm": 0.00495205290349028, "learning_rate": 1.660831509846827e-05, "loss": 0.3721, "mean_token_accuracy": 0.8997447676956654, "num_tokens": 698036090.0, "step": 1446 }, { "entropy": 0.41064453125, "epoch": 0.9008126287958319, "grad_norm": 0.005262616132331276, "learning_rate": 1.6589559237261646e-05, "loss": 0.3842, "mean_token_accuracy": 0.8980662040412426, "num_tokens": 698995170.0, "step": 1448 }, { "entropy": 0.39508056640625, "epoch": 0.9020568451339477, "grad_norm": 0.006890175671426515, "learning_rate": 1.6570803376055018e-05, "loss": 0.3798, "mean_token_accuracy": 0.8978770300745964, "num_tokens": 699956590.0, "step": 1450 }, { "entropy": 0.3994140625, "epoch": 0.9033010614720635, "grad_norm": 0.00575231546204349, "learning_rate": 1.6552047514848393e-05, "loss": 0.375, "mean_token_accuracy": 0.8987007066607475, "num_tokens": 700925906.0, "step": 1452 }, { "entropy": 0.4013671875, "epoch": 0.9045452778101792, "grad_norm": 0.004960889035991832, "learning_rate": 1.653329165364176e-05, "loss": 0.3745, "mean_token_accuracy": 0.8986719343811274, "num_tokens": 701904585.0, "step": 1454 }, { "entropy": 0.3912353515625, "epoch": 0.905789494148295, "grad_norm": 0.006562120207696263, "learning_rate": 1.6514535792435136e-05, "loss": 0.3712, "mean_token_accuracy": 0.9003562591969967, "num_tokens": 702867833.0, "step": 1456 }, { "entropy": 0.40264892578125, "epoch": 0.9070337104864108, "grad_norm": 0.006415804251732369, "learning_rate": 1.6495779931228508e-05, "loss": 0.3792, "mean_token_accuracy": 0.8982608057558537, "num_tokens": 703836825.0, "step": 1458 }, { "entropy": 0.40277099609375, "epoch": 0.9082779268245266, "grad_norm": 0.005359902710167034, "learning_rate": 1.6477024070021883e-05, "loss": 0.3796, "mean_token_accuracy": 0.8978499639779329, "num_tokens": 704787031.0, "step": 1460 }, { "entropy": 0.39874267578125, "epoch": 0.9095221431626423, "grad_norm": 0.006374125153532628, "learning_rate": 1.6458268208815255e-05, "loss": 0.3812, "mean_token_accuracy": 0.897991493344307, "num_tokens": 705741351.0, "step": 1462 }, { "entropy": 0.40863037109375, "epoch": 0.9107663595007582, "grad_norm": 0.006494324307443002, "learning_rate": 1.6439512347608627e-05, "loss": 0.3803, "mean_token_accuracy": 0.8982013240456581, "num_tokens": 706707709.0, "step": 1464 }, { "entropy": 0.408935546875, "epoch": 0.912010575838874, "grad_norm": 0.005178518090486135, "learning_rate": 1.6420756486402002e-05, "loss": 0.3833, "mean_token_accuracy": 0.8975186496973038, "num_tokens": 707672808.0, "step": 1466 }, { "entropy": 0.3885498046875, "epoch": 0.9132547921769898, "grad_norm": 0.005970162602511452, "learning_rate": 1.6402000625195374e-05, "loss": 0.3711, "mean_token_accuracy": 0.8999240305274725, "num_tokens": 708635032.0, "step": 1468 }, { "entropy": 0.4071044921875, "epoch": 0.9144990085151056, "grad_norm": 0.006057190681632163, "learning_rate": 1.638324476398875e-05, "loss": 0.383, "mean_token_accuracy": 0.8973836079239845, "num_tokens": 709590855.0, "step": 1470 }, { "entropy": 0.4022216796875, "epoch": 0.9157432248532213, "grad_norm": 0.0052526443284699, "learning_rate": 1.6364488902782118e-05, "loss": 0.3769, "mean_token_accuracy": 0.8987176176160574, "num_tokens": 710560005.0, "step": 1472 }, { "entropy": 0.38970947265625, "epoch": 0.9169874411913371, "grad_norm": 0.00571673378001787, "learning_rate": 1.6345733041575493e-05, "loss": 0.3728, "mean_token_accuracy": 0.8996557351201773, "num_tokens": 711516262.0, "step": 1474 }, { "entropy": 0.40887451171875, "epoch": 0.9182316575294529, "grad_norm": 0.0062283128064645095, "learning_rate": 1.6326977180368865e-05, "loss": 0.3804, "mean_token_accuracy": 0.8975806459784508, "num_tokens": 712473895.0, "step": 1476 }, { "entropy": 0.39862060546875, "epoch": 0.9194758738675687, "grad_norm": 0.005078555353946511, "learning_rate": 1.630822131916224e-05, "loss": 0.3738, "mean_token_accuracy": 0.899371637031436, "num_tokens": 713435375.0, "step": 1478 }, { "entropy": 0.3916015625, "epoch": 0.9207200902056845, "grad_norm": 0.006194635132283466, "learning_rate": 1.628946545795561e-05, "loss": 0.3753, "mean_token_accuracy": 0.8989935554563999, "num_tokens": 714406833.0, "step": 1480 }, { "entropy": 0.40240478515625, "epoch": 0.9219643065438003, "grad_norm": 0.006494000059806045, "learning_rate": 1.6270709596748983e-05, "loss": 0.3766, "mean_token_accuracy": 0.8985822480171919, "num_tokens": 715364007.0, "step": 1482 }, { "entropy": 0.3994140625, "epoch": 0.9232085228819161, "grad_norm": 0.00527390569793584, "learning_rate": 1.625195373554236e-05, "loss": 0.377, "mean_token_accuracy": 0.8988350462168455, "num_tokens": 716327956.0, "step": 1484 }, { "entropy": 0.39569091796875, "epoch": 0.9244527392200319, "grad_norm": 0.005055820282382125, "learning_rate": 1.623319787433573e-05, "loss": 0.3761, "mean_token_accuracy": 0.8988909460604191, "num_tokens": 717305978.0, "step": 1486 }, { "entropy": 0.39898681640625, "epoch": 0.9256969555581477, "grad_norm": 0.005878587503454893, "learning_rate": 1.6214442013129106e-05, "loss": 0.3775, "mean_token_accuracy": 0.8984644617885351, "num_tokens": 718275389.0, "step": 1488 }, { "entropy": 0.39959716796875, "epoch": 0.9269411718962635, "grad_norm": 0.00515570211996698, "learning_rate": 1.6195686151922474e-05, "loss": 0.3748, "mean_token_accuracy": 0.8989472668617964, "num_tokens": 719221679.0, "step": 1490 }, { "entropy": 0.40350341796875, "epoch": 0.9281853882343792, "grad_norm": 0.006088963462881115, "learning_rate": 1.617693029071585e-05, "loss": 0.3808, "mean_token_accuracy": 0.8978701122105122, "num_tokens": 720178350.0, "step": 1492 }, { "entropy": 0.39801025390625, "epoch": 0.929429604572495, "grad_norm": 0.005566807285709537, "learning_rate": 1.615817442950922e-05, "loss": 0.374, "mean_token_accuracy": 0.8989576529711485, "num_tokens": 721167519.0, "step": 1494 }, { "entropy": 0.399169921875, "epoch": 0.9306738209106108, "grad_norm": 0.005234621077742, "learning_rate": 1.6139418568302596e-05, "loss": 0.3799, "mean_token_accuracy": 0.8978620246052742, "num_tokens": 722129514.0, "step": 1496 }, { "entropy": 0.4046630859375, "epoch": 0.9319180372487266, "grad_norm": 0.005171974304082647, "learning_rate": 1.6120662707095968e-05, "loss": 0.3818, "mean_token_accuracy": 0.8973477799445391, "num_tokens": 723090347.0, "step": 1498 }, { "entropy": 0.39801025390625, "epoch": 0.9331622535868425, "grad_norm": 0.005292652461341518, "learning_rate": 1.610190684588934e-05, "loss": 0.3773, "mean_token_accuracy": 0.8983227275311947, "num_tokens": 724051308.0, "step": 1500 }, { "entropy": 0.396240234375, "epoch": 0.9344064699249582, "grad_norm": 0.004967073983428944, "learning_rate": 1.6083150984682715e-05, "loss": 0.3742, "mean_token_accuracy": 0.8993950467556715, "num_tokens": 725010983.0, "step": 1502 }, { "entropy": 0.4033203125, "epoch": 0.935650686263074, "grad_norm": 0.005195342725374668, "learning_rate": 1.6064395123476087e-05, "loss": 0.376, "mean_token_accuracy": 0.8983919620513916, "num_tokens": 725999273.0, "step": 1504 }, { "entropy": 0.39361572265625, "epoch": 0.9368949026011898, "grad_norm": 0.004972742625747616, "learning_rate": 1.604563926226946e-05, "loss": 0.3731, "mean_token_accuracy": 0.8992523178458214, "num_tokens": 726955108.0, "step": 1506 }, { "entropy": 0.3973388671875, "epoch": 0.9381391189393056, "grad_norm": 0.00557529583574152, "learning_rate": 1.602688340106283e-05, "loss": 0.374, "mean_token_accuracy": 0.8989895451813936, "num_tokens": 727912370.0, "step": 1508 }, { "entropy": 0.40625, "epoch": 0.9393833352774213, "grad_norm": 0.0047476289746205625, "learning_rate": 1.6008127539856206e-05, "loss": 0.3826, "mean_token_accuracy": 0.8971068877726793, "num_tokens": 728880622.0, "step": 1510 }, { "entropy": 0.3934326171875, "epoch": 0.9406275516155371, "grad_norm": 0.005791978627608442, "learning_rate": 1.598937167864958e-05, "loss": 0.3742, "mean_token_accuracy": 0.8992074690759182, "num_tokens": 729848677.0, "step": 1512 }, { "entropy": 0.39862060546875, "epoch": 0.9418717679536529, "grad_norm": 0.005973120033733374, "learning_rate": 1.5970615817442953e-05, "loss": 0.3766, "mean_token_accuracy": 0.8987386003136635, "num_tokens": 730815171.0, "step": 1514 }, { "entropy": 0.3970947265625, "epoch": 0.9431159842917687, "grad_norm": 0.005863044283240537, "learning_rate": 1.5951859956236324e-05, "loss": 0.3738, "mean_token_accuracy": 0.8993502426892519, "num_tokens": 731773555.0, "step": 1516 }, { "entropy": 0.38873291015625, "epoch": 0.9443602006298846, "grad_norm": 0.004982910689765107, "learning_rate": 1.5933104095029696e-05, "loss": 0.3697, "mean_token_accuracy": 0.899744089692831, "num_tokens": 732709784.0, "step": 1518 }, { "entropy": 0.3961181640625, "epoch": 0.9456044169680003, "grad_norm": 0.005484882238558658, "learning_rate": 1.591434823382307e-05, "loss": 0.3745, "mean_token_accuracy": 0.8995058108121157, "num_tokens": 733675817.0, "step": 1520 }, { "entropy": 0.3992919921875, "epoch": 0.9468486333061161, "grad_norm": 0.005225651983886508, "learning_rate": 1.5895592372616443e-05, "loss": 0.3775, "mean_token_accuracy": 0.8985904045403004, "num_tokens": 734653466.0, "step": 1522 }, { "entropy": 0.394287109375, "epoch": 0.9480928496442319, "grad_norm": 0.005619378118107904, "learning_rate": 1.5876836511409815e-05, "loss": 0.3704, "mean_token_accuracy": 0.8994211591780186, "num_tokens": 735612734.0, "step": 1524 }, { "entropy": 0.40087890625, "epoch": 0.9493370659823477, "grad_norm": 0.0054990447875628385, "learning_rate": 1.5858080650203187e-05, "loss": 0.377, "mean_token_accuracy": 0.8988547753542662, "num_tokens": 736600139.0, "step": 1526 }, { "entropy": 0.3919677734375, "epoch": 0.9505812823204635, "grad_norm": 0.005399463283786181, "learning_rate": 1.5839324788996562e-05, "loss": 0.3716, "mean_token_accuracy": 0.899937218055129, "num_tokens": 737555365.0, "step": 1528 }, { "entropy": 0.39630126953125, "epoch": 0.9518254986585792, "grad_norm": 0.004991588596110672, "learning_rate": 1.5820568927789937e-05, "loss": 0.3754, "mean_token_accuracy": 0.8987826146185398, "num_tokens": 738498646.0, "step": 1530 }, { "entropy": 0.391357421875, "epoch": 0.953069714996695, "grad_norm": 0.005166776894576175, "learning_rate": 1.5801813066583306e-05, "loss": 0.3669, "mean_token_accuracy": 0.9006898663938046, "num_tokens": 739444650.0, "step": 1532 }, { "entropy": 0.4019775390625, "epoch": 0.9543139313348108, "grad_norm": 0.005570000961921399, "learning_rate": 1.578305720537668e-05, "loss": 0.3819, "mean_token_accuracy": 0.8976934049278498, "num_tokens": 740397948.0, "step": 1534 }, { "entropy": 0.39129638671875, "epoch": 0.9555581476729267, "grad_norm": 0.004791044050483484, "learning_rate": 1.5764301344170053e-05, "loss": 0.3697, "mean_token_accuracy": 0.9004064463078976, "num_tokens": 741367015.0, "step": 1536 }, { "entropy": 0.40594482421875, "epoch": 0.9568023640110425, "grad_norm": 0.00507585453174105, "learning_rate": 1.5745545482963428e-05, "loss": 0.3819, "mean_token_accuracy": 0.8976070992648602, "num_tokens": 742327080.0, "step": 1538 }, { "entropy": 0.387939453125, "epoch": 0.9580465803491582, "grad_norm": 0.005211770899971171, "learning_rate": 1.57267896217568e-05, "loss": 0.3667, "mean_token_accuracy": 0.9011343512684107, "num_tokens": 743283392.0, "step": 1540 }, { "entropy": 0.39312744140625, "epoch": 0.959290796687274, "grad_norm": 0.005350962173676616, "learning_rate": 1.570803376055017e-05, "loss": 0.3741, "mean_token_accuracy": 0.8990504983812571, "num_tokens": 744252486.0, "step": 1542 }, { "entropy": 0.40008544921875, "epoch": 0.9605350130253898, "grad_norm": 0.005154927629103417, "learning_rate": 1.5689277899343543e-05, "loss": 0.3763, "mean_token_accuracy": 0.8990206830203533, "num_tokens": 745229777.0, "step": 1544 }, { "entropy": 0.40185546875, "epoch": 0.9617792293635056, "grad_norm": 0.005564314381124215, "learning_rate": 1.567052203813692e-05, "loss": 0.3776, "mean_token_accuracy": 0.8977337591350079, "num_tokens": 746191515.0, "step": 1546 }, { "entropy": 0.401611328125, "epoch": 0.9630234457016214, "grad_norm": 0.0050949398102732475, "learning_rate": 1.5651766176930294e-05, "loss": 0.3824, "mean_token_accuracy": 0.8978726975619793, "num_tokens": 747164625.0, "step": 1548 }, { "entropy": 0.394287109375, "epoch": 0.9642676620397371, "grad_norm": 0.005622520365316132, "learning_rate": 1.5633010315723662e-05, "loss": 0.3728, "mean_token_accuracy": 0.8993618916720152, "num_tokens": 748117329.0, "step": 1550 }, { "entropy": 0.403564453125, "epoch": 0.9655118783778529, "grad_norm": 0.005243511076738619, "learning_rate": 1.5614254454517037e-05, "loss": 0.38, "mean_token_accuracy": 0.8975800927728415, "num_tokens": 749089066.0, "step": 1552 }, { "entropy": 0.4052734375, "epoch": 0.9667560947159688, "grad_norm": 0.005216455225675926, "learning_rate": 1.559549859331041e-05, "loss": 0.385, "mean_token_accuracy": 0.8970531839877367, "num_tokens": 750058508.0, "step": 1554 }, { "entropy": 0.40069580078125, "epoch": 0.9680003110540846, "grad_norm": 0.005504622817686211, "learning_rate": 1.5576742732103784e-05, "loss": 0.3781, "mean_token_accuracy": 0.8984745237976313, "num_tokens": 751030086.0, "step": 1556 }, { "entropy": 0.40106201171875, "epoch": 0.9692445273922004, "grad_norm": 0.005029195802476691, "learning_rate": 1.5557986870897156e-05, "loss": 0.3758, "mean_token_accuracy": 0.8985449206084013, "num_tokens": 751978722.0, "step": 1558 }, { "entropy": 0.39691162109375, "epoch": 0.9704887437303161, "grad_norm": 0.00531322220358732, "learning_rate": 1.5539231009690528e-05, "loss": 0.3774, "mean_token_accuracy": 0.8984716422855854, "num_tokens": 752942421.0, "step": 1560 }, { "entropy": 0.3948974609375, "epoch": 0.9717329600684319, "grad_norm": 0.006071560385560267, "learning_rate": 1.55204751484839e-05, "loss": 0.37, "mean_token_accuracy": 0.8997162599116564, "num_tokens": 753891802.0, "step": 1562 }, { "entropy": 0.40008544921875, "epoch": 0.9729771764065477, "grad_norm": 0.0053507430533959914, "learning_rate": 1.5501719287277275e-05, "loss": 0.3816, "mean_token_accuracy": 0.8980532959103584, "num_tokens": 754859499.0, "step": 1564 }, { "entropy": 0.396484375, "epoch": 0.9742213927446635, "grad_norm": 0.0051724154404807765, "learning_rate": 1.548296342607065e-05, "loss": 0.3759, "mean_token_accuracy": 0.8985564652830362, "num_tokens": 755830293.0, "step": 1566 }, { "entropy": 0.407470703125, "epoch": 0.9754656090827792, "grad_norm": 0.005624150188125636, "learning_rate": 1.546420756486402e-05, "loss": 0.3821, "mean_token_accuracy": 0.8971735760569572, "num_tokens": 756793107.0, "step": 1568 }, { "entropy": 0.40228271484375, "epoch": 0.976709825420895, "grad_norm": 0.005091636027563897, "learning_rate": 1.5445451703657394e-05, "loss": 0.378, "mean_token_accuracy": 0.8983297422528267, "num_tokens": 757758919.0, "step": 1570 }, { "entropy": 0.39764404296875, "epoch": 0.9779540417590108, "grad_norm": 0.005385570271328703, "learning_rate": 1.5426695842450766e-05, "loss": 0.3794, "mean_token_accuracy": 0.8979394007474184, "num_tokens": 758720636.0, "step": 1572 }, { "entropy": 0.39697265625, "epoch": 0.9791982580971267, "grad_norm": 0.005242068590831691, "learning_rate": 1.540793998124414e-05, "loss": 0.3734, "mean_token_accuracy": 0.8991892691701651, "num_tokens": 759695047.0, "step": 1574 }, { "entropy": 0.39178466796875, "epoch": 0.9804424744352425, "grad_norm": 0.0051470681858003, "learning_rate": 1.538918412003751e-05, "loss": 0.3713, "mean_token_accuracy": 0.8998735304921865, "num_tokens": 760656282.0, "step": 1576 }, { "entropy": 0.39105224609375, "epoch": 0.9816866907733582, "grad_norm": 0.00530612632281339, "learning_rate": 1.5370428258830884e-05, "loss": 0.3716, "mean_token_accuracy": 0.8999177347868681, "num_tokens": 761618533.0, "step": 1578 }, { "entropy": 0.3955078125, "epoch": 0.982930907111474, "grad_norm": 0.00538197550326783, "learning_rate": 1.535167239762426e-05, "loss": 0.3725, "mean_token_accuracy": 0.899470366537571, "num_tokens": 762570827.0, "step": 1580 }, { "entropy": 0.39520263671875, "epoch": 0.9841751234495898, "grad_norm": 0.004796086550540488, "learning_rate": 1.533291653641763e-05, "loss": 0.3735, "mean_token_accuracy": 0.8991418685764074, "num_tokens": 763541076.0, "step": 1582 }, { "entropy": 0.39569091796875, "epoch": 0.9854193397877056, "grad_norm": 0.005509463855541266, "learning_rate": 1.5314160675211007e-05, "loss": 0.3725, "mean_token_accuracy": 0.8994384594261646, "num_tokens": 764486698.0, "step": 1584 }, { "entropy": 0.388427734375, "epoch": 0.9866635561258214, "grad_norm": 0.00499671339184283, "learning_rate": 1.5295404814004375e-05, "loss": 0.3711, "mean_token_accuracy": 0.9000991955399513, "num_tokens": 765460397.0, "step": 1586 }, { "entropy": 0.396240234375, "epoch": 0.9879077724639371, "grad_norm": 0.004994469206801488, "learning_rate": 1.527664895279775e-05, "loss": 0.3723, "mean_token_accuracy": 0.8991228640079498, "num_tokens": 766438845.0, "step": 1588 }, { "entropy": 0.40185546875, "epoch": 0.9891519888020529, "grad_norm": 0.004762622353363343, "learning_rate": 1.5257893091591124e-05, "loss": 0.3754, "mean_token_accuracy": 0.8986294474452734, "num_tokens": 767399019.0, "step": 1590 }, { "entropy": 0.3883056640625, "epoch": 0.9903962051401688, "grad_norm": 0.004879202994948607, "learning_rate": 1.5239137230384497e-05, "loss": 0.3666, "mean_token_accuracy": 0.9006400983780622, "num_tokens": 768352369.0, "step": 1592 }, { "entropy": 0.390869140625, "epoch": 0.9916404214782846, "grad_norm": 0.0052616236638453485, "learning_rate": 1.5220381369177867e-05, "loss": 0.372, "mean_token_accuracy": 0.8999687861651182, "num_tokens": 769321767.0, "step": 1594 }, { "entropy": 0.40875244140625, "epoch": 0.9928846378164004, "grad_norm": 0.0058216283106160335, "learning_rate": 1.520162550797124e-05, "loss": 0.3826, "mean_token_accuracy": 0.8974384441971779, "num_tokens": 770294498.0, "step": 1596 }, { "entropy": 0.39202880859375, "epoch": 0.9941288541545161, "grad_norm": 0.0053290721045101295, "learning_rate": 1.5182869646764614e-05, "loss": 0.3697, "mean_token_accuracy": 0.9001308493316174, "num_tokens": 771257072.0, "step": 1598 }, { "entropy": 0.3895263671875, "epoch": 0.9953730704926319, "grad_norm": 0.005162953347744006, "learning_rate": 1.5164113785557988e-05, "loss": 0.3701, "mean_token_accuracy": 0.8999896571040154, "num_tokens": 772214921.0, "step": 1600 }, { "epoch": 0.9953730704926319, "eval_entropy": 0.39652909204209746, "eval_loss": 0.37379172444343567, "eval_mean_token_accuracy": 0.8991162767262367, "eval_num_tokens": 772214921.0, "eval_runtime": 425.6124, "eval_samples_per_second": 203.542, "eval_steps_per_second": 3.181, "step": 1600 }, { "entropy": 0.3961181640625, "epoch": 0.9966172868307477, "grad_norm": 0.005578699837870134, "learning_rate": 1.5145357924351361e-05, "loss": 0.3726, "mean_token_accuracy": 0.899048451334238, "num_tokens": 773174318.0, "step": 1602 }, { "entropy": 0.39849853515625, "epoch": 0.9978615031688635, "grad_norm": 0.005109702947035346, "learning_rate": 1.5126602063144731e-05, "loss": 0.3764, "mean_token_accuracy": 0.8993540611118078, "num_tokens": 774141742.0, "step": 1604 }, { "entropy": 0.3927001953125, "epoch": 0.9991057195069792, "grad_norm": 0.005204541232574947, "learning_rate": 1.5107846201938107e-05, "loss": 0.3732, "mean_token_accuracy": 0.8991651739925146, "num_tokens": 775107210.0, "step": 1606 }, { "entropy": 0.397758152173913, "epoch": 1.0, "grad_norm": 0.007502892535766562, "learning_rate": 1.508909034073148e-05, "loss": 0.3703, "mean_token_accuracy": 0.8987410509068033, "num_tokens": 775801518.0, "step": 1608 } ], "logging_steps": 2, "max_steps": 3216, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7241803122016256e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }