{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 10, "global_step": 129, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023255813953488372, "grad_norm": 3.35943865776062, "learning_rate": 0.0, "loss": 4.603, "step": 1 }, { "epoch": 0.046511627906976744, "grad_norm": 3.289452075958252, "learning_rate": 1.5384615384615387e-05, "loss": 4.5344, "step": 2 }, { "epoch": 0.06976744186046512, "grad_norm": 3.1487317085266113, "learning_rate": 3.0769230769230774e-05, "loss": 4.4828, "step": 3 }, { "epoch": 0.09302325581395349, "grad_norm": 2.7355170249938965, "learning_rate": 4.615384615384616e-05, "loss": 4.3188, "step": 4 }, { "epoch": 0.11627906976744186, "grad_norm": 2.17081880569458, "learning_rate": 6.153846153846155e-05, "loss": 4.1009, "step": 5 }, { "epoch": 0.13953488372093023, "grad_norm": 2.007120370864868, "learning_rate": 7.692307692307693e-05, "loss": 3.8892, "step": 6 }, { "epoch": 0.16279069767441862, "grad_norm": 1.9022364616394043, "learning_rate": 9.230769230769232e-05, "loss": 3.7229, "step": 7 }, { "epoch": 0.18604651162790697, "grad_norm": 1.9221678972244263, "learning_rate": 0.0001076923076923077, "loss": 3.5347, "step": 8 }, { "epoch": 0.20930232558139536, "grad_norm": 2.0581228733062744, "learning_rate": 0.0001230769230769231, "loss": 3.2141, "step": 9 }, { "epoch": 0.23255813953488372, "grad_norm": 2.2354397773742676, "learning_rate": 0.00013846153846153847, "loss": 2.8757, "step": 10 }, { "epoch": 0.23255813953488372, "eval_loss": 2.621652841567993, "eval_runtime": 280.141, "eval_samples_per_second": 17.448, "eval_steps_per_second": 0.139, "step": 10 }, { "epoch": 0.2558139534883721, "grad_norm": 2.2474637031555176, "learning_rate": 0.00015384615384615385, "loss": 2.5382, "step": 11 }, { "epoch": 0.27906976744186046, "grad_norm": 2.040398597717285, "learning_rate": 0.00016923076923076923, "loss": 2.1967, "step": 12 }, { "epoch": 0.3023255813953488, "grad_norm": 1.9672517776489258, "learning_rate": 0.00018461538461538463, "loss": 1.923, "step": 13 }, { "epoch": 0.32558139534883723, "grad_norm": 2.168889045715332, "learning_rate": 0.0002, "loss": 1.7671, "step": 14 }, { "epoch": 0.3488372093023256, "grad_norm": 1.2723966836929321, "learning_rate": 0.0001999633286223284, "loss": 1.5425, "step": 15 }, { "epoch": 0.37209302325581395, "grad_norm": 1.0903257131576538, "learning_rate": 0.00019985334138511237, "loss": 1.3849, "step": 16 }, { "epoch": 0.3953488372093023, "grad_norm": 1.4316954612731934, "learning_rate": 0.0001996701189560223, "loss": 1.2584, "step": 17 }, { "epoch": 0.4186046511627907, "grad_norm": 1.6875934600830078, "learning_rate": 0.00019941379571543596, "loss": 1.1068, "step": 18 }, { "epoch": 0.4418604651162791, "grad_norm": 1.6493124961853027, "learning_rate": 0.00019908455965788067, "loss": 0.9556, "step": 19 }, { "epoch": 0.46511627906976744, "grad_norm": 1.4394609928131104, "learning_rate": 0.00019868265225415265, "loss": 0.7735, "step": 20 }, { "epoch": 0.46511627906976744, "eval_loss": 0.6109094619750977, "eval_runtime": 278.8403, "eval_samples_per_second": 17.53, "eval_steps_per_second": 0.14, "step": 20 }, { "epoch": 0.4883720930232558, "grad_norm": 1.4170726537704468, "learning_rate": 0.0001982083682742156, "loss": 0.5833, "step": 21 }, { "epoch": 0.5116279069767442, "grad_norm": 1.226104497909546, "learning_rate": 0.00019766205557100868, "loss": 0.401, "step": 22 }, { "epoch": 0.5348837209302325, "grad_norm": 1.3556923866271973, "learning_rate": 0.00019704411482532116, "loss": 0.2544, "step": 23 }, { "epoch": 0.5581395348837209, "grad_norm": 0.7993350028991699, "learning_rate": 0.0001963549992519223, "loss": 0.1426, "step": 24 }, { "epoch": 0.5813953488372093, "grad_norm": 0.4882752001285553, "learning_rate": 0.00019559521426716118, "loss": 0.0813, "step": 25 }, { "epoch": 0.6046511627906976, "grad_norm": 0.22902335226535797, "learning_rate": 0.00019476531711828027, "loss": 0.0541, "step": 26 }, { "epoch": 0.627906976744186, "grad_norm": 0.1321583092212677, "learning_rate": 0.00019386591647471506, "loss": 0.0452, "step": 27 }, { "epoch": 0.6511627906976745, "grad_norm": 0.11517132818698883, "learning_rate": 0.00019289767198167916, "loss": 0.0382, "step": 28 }, { "epoch": 0.6744186046511628, "grad_norm": 0.10609164088964462, "learning_rate": 0.0001918612937763622, "loss": 0.0362, "step": 29 }, { "epoch": 0.6976744186046512, "grad_norm": 0.09162303805351257, "learning_rate": 0.00019075754196709572, "loss": 0.0325, "step": 30 }, { "epoch": 0.6976744186046512, "eval_loss": 0.03507654368877411, "eval_runtime": 284.4431, "eval_samples_per_second": 17.184, "eval_steps_per_second": 0.137, "step": 30 }, { "epoch": 0.7209302325581395, "grad_norm": 0.09819968044757843, "learning_rate": 0.0001895872260758688, "loss": 0.0312, "step": 31 }, { "epoch": 0.7441860465116279, "grad_norm": 0.0848526805639267, "learning_rate": 0.0001883512044446023, "loss": 0.0322, "step": 32 }, { "epoch": 0.7674418604651163, "grad_norm": 0.0566725954413414, "learning_rate": 0.0001870503836056172, "loss": 0.029, "step": 33 }, { "epoch": 0.7906976744186046, "grad_norm": 0.11173616349697113, "learning_rate": 0.00018568571761675893, "loss": 0.0312, "step": 34 }, { "epoch": 0.813953488372093, "grad_norm": 0.06333456933498383, "learning_rate": 0.0001842582073616649, "loss": 0.0275, "step": 35 }, { "epoch": 0.8372093023255814, "grad_norm": 0.05882587283849716, "learning_rate": 0.00018276889981568906, "loss": 0.0312, "step": 36 }, { "epoch": 0.8604651162790697, "grad_norm": 0.04547254368662834, "learning_rate": 0.00018121888727802113, "loss": 0.0287, "step": 37 }, { "epoch": 0.8837209302325582, "grad_norm": 0.060173455625772476, "learning_rate": 0.00017960930657056438, "loss": 0.03, "step": 38 }, { "epoch": 0.9069767441860465, "grad_norm": 0.04700060561299324, "learning_rate": 0.00017794133820415916, "loss": 0.0288, "step": 39 }, { "epoch": 0.9302325581395349, "grad_norm": 0.0446193665266037, "learning_rate": 0.00017621620551276366, "loss": 0.0289, "step": 40 }, { "epoch": 0.9302325581395349, "eval_loss": 0.029409727081656456, "eval_runtime": 280.3805, "eval_samples_per_second": 17.433, "eval_steps_per_second": 0.139, "step": 40 }, { "epoch": 0.9534883720930233, "grad_norm": 0.05349310114979744, "learning_rate": 0.00017443517375622704, "loss": 0.0307, "step": 41 }, { "epoch": 0.9767441860465116, "grad_norm": 0.05011633783578873, "learning_rate": 0.0001725995491923131, "loss": 0.0259, "step": 42 }, { "epoch": 1.0, "grad_norm": 0.05545727163553238, "learning_rate": 0.00017071067811865476, "loss": 0.0261, "step": 43 }, { "epoch": 1.0232558139534884, "grad_norm": 0.03875494748353958, "learning_rate": 0.00016876994588534234, "loss": 0.0229, "step": 44 }, { "epoch": 1.0465116279069768, "grad_norm": 0.04426169767975807, "learning_rate": 0.00016677877587886956, "loss": 0.023, "step": 45 }, { "epoch": 1.069767441860465, "grad_norm": 0.0356602780520916, "learning_rate": 0.00016473862847818277, "loss": 0.0206, "step": 46 }, { "epoch": 1.0930232558139534, "grad_norm": 0.03734959289431572, "learning_rate": 0.00016265099998359866, "loss": 0.022, "step": 47 }, { "epoch": 1.1162790697674418, "grad_norm": 0.037262506783008575, "learning_rate": 0.00016051742151937655, "loss": 0.0209, "step": 48 }, { "epoch": 1.1395348837209303, "grad_norm": 0.051405053585767746, "learning_rate": 0.00015833945791074943, "loss": 0.0251, "step": 49 }, { "epoch": 1.1627906976744187, "grad_norm": 0.03806193917989731, "learning_rate": 0.00015611870653623825, "loss": 0.0215, "step": 50 }, { "epoch": 1.1627906976744187, "eval_loss": 0.027570661157369614, "eval_runtime": 277.8404, "eval_samples_per_second": 17.593, "eval_steps_per_second": 0.14, "step": 50 }, { "epoch": 1.1860465116279069, "grad_norm": 0.03529495373368263, "learning_rate": 0.00015385679615609042, "loss": 0.021, "step": 51 }, { "epoch": 1.2093023255813953, "grad_norm": 0.04077022895216942, "learning_rate": 0.00015155538571770218, "loss": 0.0196, "step": 52 }, { "epoch": 1.2325581395348837, "grad_norm": 0.03276953846216202, "learning_rate": 0.00014921616313890072, "loss": 0.0217, "step": 53 }, { "epoch": 1.255813953488372, "grad_norm": 0.03502936288714409, "learning_rate": 0.00014684084406997903, "loss": 0.0189, "step": 54 }, { "epoch": 1.2790697674418605, "grad_norm": 0.038756098598241806, "learning_rate": 0.00014443117063539038, "loss": 0.0225, "step": 55 }, { "epoch": 1.302325581395349, "grad_norm": 0.03458042070269585, "learning_rate": 0.00014198891015602646, "loss": 0.0184, "step": 56 }, { "epoch": 1.3255813953488373, "grad_norm": 0.032612334936857224, "learning_rate": 0.00013951585385301555, "loss": 0.0219, "step": 57 }, { "epoch": 1.3488372093023255, "grad_norm": 0.030378634110093117, "learning_rate": 0.00013701381553399145, "loss": 0.0188, "step": 58 }, { "epoch": 1.372093023255814, "grad_norm": 0.037341564893722534, "learning_rate": 0.00013448463026279704, "loss": 0.0188, "step": 59 }, { "epoch": 1.3953488372093024, "grad_norm": 0.03775785118341446, "learning_rate": 0.000131930153013598, "loss": 0.021, "step": 60 }, { "epoch": 1.3953488372093024, "eval_loss": 0.026697171851992607, "eval_runtime": 277.6346, "eval_samples_per_second": 17.606, "eval_steps_per_second": 0.14, "step": 60 }, { "epoch": 1.4186046511627908, "grad_norm": 0.034544046968221664, "learning_rate": 0.00012935225731039348, "loss": 0.0181, "step": 61 }, { "epoch": 1.441860465116279, "grad_norm": 0.03138190880417824, "learning_rate": 0.00012675283385292212, "loss": 0.0191, "step": 62 }, { "epoch": 1.4651162790697674, "grad_norm": 0.03294781595468521, "learning_rate": 0.00012413378912997058, "loss": 0.0208, "step": 63 }, { "epoch": 1.4883720930232558, "grad_norm": 0.03295775130391121, "learning_rate": 0.00012149704402110243, "loss": 0.0211, "step": 64 }, { "epoch": 1.5116279069767442, "grad_norm": 0.03224225342273712, "learning_rate": 0.00011884453238783185, "loss": 0.0198, "step": 65 }, { "epoch": 1.5348837209302326, "grad_norm": 0.03675834834575653, "learning_rate": 0.0001161781996552765, "loss": 0.0217, "step": 66 }, { "epoch": 1.558139534883721, "grad_norm": 0.032391469925642014, "learning_rate": 0.00011350000138532902, "loss": 0.02, "step": 67 }, { "epoch": 1.5813953488372094, "grad_norm": 0.0394534207880497, "learning_rate": 0.00011081190184239419, "loss": 0.0195, "step": 68 }, { "epoch": 1.6046511627906976, "grad_norm": 0.03158976882696152, "learning_rate": 0.00010811587255274313, "loss": 0.0189, "step": 69 }, { "epoch": 1.627906976744186, "grad_norm": 0.03663668408989906, "learning_rate": 0.00010541389085854176, "loss": 0.0182, "step": 70 }, { "epoch": 1.627906976744186, "eval_loss": 0.025791307911276817, "eval_runtime": 283.3586, "eval_samples_per_second": 17.25, "eval_steps_per_second": 0.138, "step": 70 }, { "epoch": 1.6511627906976745, "grad_norm": 0.03444863110780716, "learning_rate": 0.00010270793846761347, "loss": 0.0216, "step": 71 }, { "epoch": 1.6744186046511627, "grad_norm": 0.03390992805361748, "learning_rate": 0.0001, "loss": 0.0212, "step": 72 }, { "epoch": 1.697674418604651, "grad_norm": 0.03509373590350151, "learning_rate": 9.729206153238657e-05, "loss": 0.0175, "step": 73 }, { "epoch": 1.7209302325581395, "grad_norm": 0.03165116906166077, "learning_rate": 9.458610914145826e-05, "loss": 0.0194, "step": 74 }, { "epoch": 1.744186046511628, "grad_norm": 0.040131378918886185, "learning_rate": 9.18841274472569e-05, "loss": 0.0204, "step": 75 }, { "epoch": 1.7674418604651163, "grad_norm": 0.03355936333537102, "learning_rate": 8.918809815760585e-05, "loss": 0.0211, "step": 76 }, { "epoch": 1.7906976744186047, "grad_norm": 0.03639550507068634, "learning_rate": 8.649999861467099e-05, "loss": 0.02, "step": 77 }, { "epoch": 1.8139534883720931, "grad_norm": 0.03191493824124336, "learning_rate": 8.382180034472353e-05, "loss": 0.0189, "step": 78 }, { "epoch": 1.8372093023255816, "grad_norm": 0.031350504606962204, "learning_rate": 8.115546761216822e-05, "loss": 0.0181, "step": 79 }, { "epoch": 1.8604651162790697, "grad_norm": 0.033913638442754745, "learning_rate": 7.85029559788976e-05, "loss": 0.021, "step": 80 }, { "epoch": 1.8604651162790697, "eval_loss": 0.02541690692305565, "eval_runtime": 278.2139, "eval_samples_per_second": 17.569, "eval_steps_per_second": 0.14, "step": 80 }, { "epoch": 1.8837209302325582, "grad_norm": 0.03476489707827568, "learning_rate": 7.586621087002945e-05, "loss": 0.0182, "step": 81 }, { "epoch": 1.9069767441860463, "grad_norm": 0.02707247994840145, "learning_rate": 7.324716614707793e-05, "loss": 0.0184, "step": 82 }, { "epoch": 1.9302325581395348, "grad_norm": 0.03282856568694115, "learning_rate": 7.064774268960653e-05, "loss": 0.02, "step": 83 }, { "epoch": 1.9534883720930232, "grad_norm": 0.029144227504730225, "learning_rate": 6.806984698640202e-05, "loss": 0.0191, "step": 84 }, { "epoch": 1.9767441860465116, "grad_norm": 0.03824353590607643, "learning_rate": 6.551536973720298e-05, "loss": 0.0214, "step": 85 }, { "epoch": 2.0, "grad_norm": 0.030625835061073303, "learning_rate": 6.298618446600856e-05, "loss": 0.0188, "step": 86 }, { "epoch": 2.0232558139534884, "grad_norm": 0.03776366263628006, "learning_rate": 6.048414614698448e-05, "loss": 0.0135, "step": 87 }, { "epoch": 2.046511627906977, "grad_norm": 0.026809707283973694, "learning_rate": 5.801108984397354e-05, "loss": 0.0153, "step": 88 }, { "epoch": 2.0697674418604652, "grad_norm": 0.02388789877295494, "learning_rate": 5.5568829364609664e-05, "loss": 0.0133, "step": 89 }, { "epoch": 2.0930232558139537, "grad_norm": 0.03190658614039421, "learning_rate": 5.3159155930021e-05, "loss": 0.0123, "step": 90 }, { "epoch": 2.0930232558139537, "eval_loss": 0.025698909536004066, "eval_runtime": 279.9189, "eval_samples_per_second": 17.462, "eval_steps_per_second": 0.139, "step": 90 }, { "epoch": 2.116279069767442, "grad_norm": 0.02972349151968956, "learning_rate": 5.078383686109926e-05, "loss": 0.0137, "step": 91 }, { "epoch": 2.13953488372093, "grad_norm": 0.025936435908079147, "learning_rate": 4.844461428229782e-05, "loss": 0.0155, "step": 92 }, { "epoch": 2.1627906976744184, "grad_norm": 0.027229925617575645, "learning_rate": 4.614320384390959e-05, "loss": 0.0131, "step": 93 }, { "epoch": 2.186046511627907, "grad_norm": 0.02414911612868309, "learning_rate": 4.388129346376178e-05, "loss": 0.0144, "step": 94 }, { "epoch": 2.2093023255813953, "grad_norm": 0.026830129325389862, "learning_rate": 4.16605420892506e-05, "loss": 0.0116, "step": 95 }, { "epoch": 2.2325581395348837, "grad_norm": 0.02861592546105385, "learning_rate": 3.948257848062351e-05, "loss": 0.0146, "step": 96 }, { "epoch": 2.255813953488372, "grad_norm": 0.024740004912018776, "learning_rate": 3.734900001640135e-05, "loss": 0.0119, "step": 97 }, { "epoch": 2.2790697674418605, "grad_norm": 0.026296459138393402, "learning_rate": 3.5261371521817244e-05, "loss": 0.0136, "step": 98 }, { "epoch": 2.302325581395349, "grad_norm": 0.028810663148760796, "learning_rate": 3.322122412113047e-05, "loss": 0.0138, "step": 99 }, { "epoch": 2.3255813953488373, "grad_norm": 0.029194893315434456, "learning_rate": 3.123005411465766e-05, "loss": 0.0149, "step": 100 }, { "epoch": 2.3255813953488373, "eval_loss": 0.026885949075222015, "eval_runtime": 279.8632, "eval_samples_per_second": 17.466, "eval_steps_per_second": 0.139, "step": 100 }, { "epoch": 2.3488372093023258, "grad_norm": 0.02966705709695816, "learning_rate": 2.9289321881345254e-05, "loss": 0.0139, "step": 101 }, { "epoch": 2.3720930232558137, "grad_norm": 0.026589643210172653, "learning_rate": 2.7400450807686938e-05, "loss": 0.0129, "step": 102 }, { "epoch": 2.395348837209302, "grad_norm": 0.028834247961640358, "learning_rate": 2.5564826243772966e-05, "loss": 0.0137, "step": 103 }, { "epoch": 2.4186046511627906, "grad_norm": 0.03186238184571266, "learning_rate": 2.3783794487236365e-05, "loss": 0.0123, "step": 104 }, { "epoch": 2.441860465116279, "grad_norm": 0.031148750334978104, "learning_rate": 2.205866179584084e-05, "loss": 0.0141, "step": 105 }, { "epoch": 2.4651162790697674, "grad_norm": 0.029319997876882553, "learning_rate": 2.0390693429435627e-05, "loss": 0.0103, "step": 106 }, { "epoch": 2.488372093023256, "grad_norm": 0.0360947921872139, "learning_rate": 1.87811127219789e-05, "loss": 0.0152, "step": 107 }, { "epoch": 2.511627906976744, "grad_norm": 0.02554849162697792, "learning_rate": 1.7231100184310956e-05, "loss": 0.0122, "step": 108 }, { "epoch": 2.5348837209302326, "grad_norm": 0.033594969660043716, "learning_rate": 1.5741792638335095e-05, "loss": 0.015, "step": 109 }, { "epoch": 2.558139534883721, "grad_norm": 0.026471910998225212, "learning_rate": 1.4314282383241096e-05, "loss": 0.012, "step": 110 }, { "epoch": 2.558139534883721, "eval_loss": 0.026891091838479042, "eval_runtime": 279.8879, "eval_samples_per_second": 17.464, "eval_steps_per_second": 0.139, "step": 110 }, { "epoch": 2.5813953488372094, "grad_norm": 0.027651213109493256, "learning_rate": 1.2949616394382802e-05, "loss": 0.0132, "step": 111 }, { "epoch": 2.604651162790698, "grad_norm": 0.02776024490594864, "learning_rate": 1.1648795555397719e-05, "loss": 0.0129, "step": 112 }, { "epoch": 2.6279069767441863, "grad_norm": 0.02689000964164734, "learning_rate": 1.0412773924131203e-05, "loss": 0.0097, "step": 113 }, { "epoch": 2.6511627906976747, "grad_norm": 0.03100793994963169, "learning_rate": 9.242458032904311e-06, "loss": 0.0147, "step": 114 }, { "epoch": 2.6744186046511627, "grad_norm": 0.025405889376997948, "learning_rate": 8.138706223637827e-06, "loss": 0.011, "step": 115 }, { "epoch": 2.697674418604651, "grad_norm": 0.024790151044726372, "learning_rate": 7.102328018320858e-06, "loss": 0.0107, "step": 116 }, { "epoch": 2.7209302325581395, "grad_norm": 0.030019311234354973, "learning_rate": 6.13408352528495e-06, "loss": 0.0122, "step": 117 }, { "epoch": 2.744186046511628, "grad_norm": 0.03311316668987274, "learning_rate": 5.2346828817197655e-06, "loss": 0.0158, "step": 118 }, { "epoch": 2.7674418604651163, "grad_norm": 0.03373163193464279, "learning_rate": 4.404785732838846e-06, "loss": 0.0124, "step": 119 }, { "epoch": 2.7906976744186047, "grad_norm": 0.02422955445945263, "learning_rate": 3.6450007480777093e-06, "loss": 0.0102, "step": 120 }, { "epoch": 2.7906976744186047, "eval_loss": 0.026771264150738716, "eval_runtime": 279.5419, "eval_samples_per_second": 17.486, "eval_steps_per_second": 0.14, "step": 120 }, { "epoch": 2.813953488372093, "grad_norm": 0.028385188430547714, "learning_rate": 2.9558851746788517e-06, "loss": 0.0137, "step": 121 }, { "epoch": 2.8372093023255816, "grad_norm": 0.03301030769944191, "learning_rate": 2.3379444289913342e-06, "loss": 0.0153, "step": 122 }, { "epoch": 2.8604651162790695, "grad_norm": 0.027750354260206223, "learning_rate": 1.7916317257844039e-06, "loss": 0.0123, "step": 123 }, { "epoch": 2.883720930232558, "grad_norm": 0.029665982350707054, "learning_rate": 1.317347745847386e-06, "loss": 0.0139, "step": 124 }, { "epoch": 2.9069767441860463, "grad_norm": 0.026960058137774467, "learning_rate": 9.154403421193225e-07, "loss": 0.012, "step": 125 }, { "epoch": 2.9302325581395348, "grad_norm": 0.05238181725144386, "learning_rate": 5.862042845640403e-07, "loss": 0.0153, "step": 126 }, { "epoch": 2.953488372093023, "grad_norm": 0.036755241453647614, "learning_rate": 3.298810439777311e-07, "loss": 0.0156, "step": 127 }, { "epoch": 2.9767441860465116, "grad_norm": 0.02995097078382969, "learning_rate": 1.4665861488761813e-07, "loss": 0.0118, "step": 128 }, { "epoch": 3.0, "grad_norm": 0.02632974274456501, "learning_rate": 3.667137767160433e-08, "loss": 0.0123, "step": 129 }, { "epoch": 3.0, "step": 129, "total_flos": 4.432857893200462e+18, "train_loss": 0.4512600473944069, "train_runtime": 28077.9211, "train_samples_per_second": 4.7, "train_steps_per_second": 0.005 } ], "logging_steps": 1.0, "max_steps": 129, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.432857893200462e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }