{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.35362014690451204, "eval_steps": 500, "global_step": 337, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005246589716684155, "grad_norm": 8.869540214538574, "learning_rate": 8.401166180758017e-06, "loss": 0.9257, "mean_token_accuracy": 0.8380203127861023, "num_tokens": 289686.0, "step": 5 }, { "epoch": 0.01049317943336831, "grad_norm": 1.9420901536941528, "learning_rate": 1.8902623906705538e-05, "loss": 0.5266, "mean_token_accuracy": 0.8593480110168457, "num_tokens": 584447.0, "step": 10 }, { "epoch": 0.015739769150052464, "grad_norm": 1.2173879146575928, "learning_rate": 2.9404081632653062e-05, "loss": 0.3377, "mean_token_accuracy": 0.894003939628601, "num_tokens": 865283.0, "step": 15 }, { "epoch": 0.02098635886673662, "grad_norm": 0.9192031621932983, "learning_rate": 3.990553935860058e-05, "loss": 0.3091, "mean_token_accuracy": 0.9075875878334045, "num_tokens": 1150898.0, "step": 20 }, { "epoch": 0.026232948583420776, "grad_norm": 0.8734805583953857, "learning_rate": 5.0406997084548104e-05, "loss": 0.2471, "mean_token_accuracy": 0.9180729031562805, "num_tokens": 1456620.0, "step": 25 }, { "epoch": 0.03147953830010493, "grad_norm": 0.7508789300918579, "learning_rate": 6.090845481049563e-05, "loss": 0.2155, "mean_token_accuracy": 0.9300779223442077, "num_tokens": 1751457.0, "step": 30 }, { "epoch": 0.03672612801678909, "grad_norm": 0.8536575436592102, "learning_rate": 7.140991253644314e-05, "loss": 0.2623, "mean_token_accuracy": 0.9183142781257629, "num_tokens": 2039245.0, "step": 35 }, { "epoch": 0.04197271773347324, "grad_norm": 0.868010938167572, "learning_rate": 7.350762136663497e-05, "loss": 0.2239, "mean_token_accuracy": 0.9314786314964294, "num_tokens": 2330149.0, "step": 40 }, { "epoch": 0.0472193074501574, "grad_norm": 0.7415319681167603, "learning_rate": 7.349712991638426e-05, "loss": 0.2124, "mean_token_accuracy": 0.935946810245514, "num_tokens": 2627934.0, "step": 45 }, { "epoch": 0.05246589716684155, "grad_norm": 0.6589397192001343, "learning_rate": 7.347857138053374e-05, "loss": 0.2258, "mean_token_accuracy": 0.9339556813240051, "num_tokens": 2909298.0, "step": 50 }, { "epoch": 0.05771248688352571, "grad_norm": 0.8945339322090149, "learning_rate": 7.345195119268144e-05, "loss": 0.2135, "mean_token_accuracy": 0.9362031579017639, "num_tokens": 3215160.0, "step": 55 }, { "epoch": 0.06295907660020986, "grad_norm": 0.6015563607215881, "learning_rate": 7.341727714672879e-05, "loss": 0.223, "mean_token_accuracy": 0.9350167393684388, "num_tokens": 3512700.0, "step": 60 }, { "epoch": 0.06820566631689402, "grad_norm": 0.9376577138900757, "learning_rate": 7.337455939459886e-05, "loss": 0.1949, "mean_token_accuracy": 0.9446848630905151, "num_tokens": 3798243.0, "step": 65 }, { "epoch": 0.07345225603357818, "grad_norm": 0.8647239804267883, "learning_rate": 7.332381044326387e-05, "loss": 0.192, "mean_token_accuracy": 0.9473262071609497, "num_tokens": 4088918.0, "step": 70 }, { "epoch": 0.07869884575026233, "grad_norm": 0.6758660078048706, "learning_rate": 7.326504515108353e-05, "loss": 0.1608, "mean_token_accuracy": 0.9533717513084412, "num_tokens": 4373400.0, "step": 75 }, { "epoch": 0.08394543546694648, "grad_norm": 0.6870194673538208, "learning_rate": 7.319828072345472e-05, "loss": 0.1719, "mean_token_accuracy": 0.947171425819397, "num_tokens": 4658256.0, "step": 80 }, { "epoch": 0.08919202518363065, "grad_norm": 0.677169680595398, "learning_rate": 7.312353670777409e-05, "loss": 0.2406, "mean_token_accuracy": 0.9317703485488892, "num_tokens": 4942712.0, "step": 85 }, { "epoch": 0.0944386149003148, "grad_norm": 0.7355192303657532, "learning_rate": 7.304083498771491e-05, "loss": 0.1559, "mean_token_accuracy": 0.9547448277473449, "num_tokens": 5236965.0, "step": 90 }, { "epoch": 0.09968520461699895, "grad_norm": 0.8258501291275024, "learning_rate": 7.295019977681995e-05, "loss": 0.1484, "mean_token_accuracy": 0.9531127452850342, "num_tokens": 5531311.0, "step": 95 }, { "epoch": 0.1049317943336831, "grad_norm": 0.6302676200866699, "learning_rate": 7.285165761141225e-05, "loss": 0.1302, "mean_token_accuracy": 0.9609502911567688, "num_tokens": 5824530.0, "step": 100 }, { "epoch": 0.11017838405036726, "grad_norm": 0.7048377990722656, "learning_rate": 7.274523734282567e-05, "loss": 0.1655, "mean_token_accuracy": 0.9510802745819091, "num_tokens": 6126073.0, "step": 105 }, { "epoch": 0.11542497376705142, "grad_norm": 0.708620011806488, "learning_rate": 7.263097012895783e-05, "loss": 0.1919, "mean_token_accuracy": 0.9440585255622864, "num_tokens": 6428088.0, "step": 110 }, { "epoch": 0.12067156348373557, "grad_norm": 0.7772646546363831, "learning_rate": 7.250888942514763e-05, "loss": 0.1791, "mean_token_accuracy": 0.9519138216972352, "num_tokens": 6709759.0, "step": 115 }, { "epoch": 0.1259181532004197, "grad_norm": 0.728533148765564, "learning_rate": 7.237903097438013e-05, "loss": 0.1794, "mean_token_accuracy": 0.9493731379508972, "num_tokens": 7009928.0, "step": 120 }, { "epoch": 0.1311647429171039, "grad_norm": 0.5031924843788147, "learning_rate": 7.224143279682167e-05, "loss": 0.1966, "mean_token_accuracy": 0.9432877659797668, "num_tokens": 7299912.0, "step": 125 }, { "epoch": 0.13641133263378805, "grad_norm": 0.7311460375785828, "learning_rate": 7.209613517868828e-05, "loss": 0.1623, "mean_token_accuracy": 0.9553608179092408, "num_tokens": 7587766.0, "step": 130 }, { "epoch": 0.1416579223504722, "grad_norm": 0.4527396261692047, "learning_rate": 7.194318066045057e-05, "loss": 0.1552, "mean_token_accuracy": 0.9542324781417847, "num_tokens": 7883332.0, "step": 135 }, { "epoch": 0.14690451206715635, "grad_norm": 0.5695252418518066, "learning_rate": 7.178261402437872e-05, "loss": 0.1409, "mean_token_accuracy": 0.9568296670913696, "num_tokens": 8184963.0, "step": 140 }, { "epoch": 0.1521511017838405, "grad_norm": 0.47259852290153503, "learning_rate": 7.161448228143099e-05, "loss": 0.1465, "mean_token_accuracy": 0.9609991312026978, "num_tokens": 8481153.0, "step": 145 }, { "epoch": 0.15739769150052466, "grad_norm": 0.5303276181221008, "learning_rate": 7.143883465748988e-05, "loss": 0.183, "mean_token_accuracy": 0.9521325826644897, "num_tokens": 8775155.0, "step": 150 }, { "epoch": 0.1626442812172088, "grad_norm": 0.46558475494384766, "learning_rate": 7.125572257894959e-05, "loss": 0.205, "mean_token_accuracy": 0.9444481015205384, "num_tokens": 9071761.0, "step": 155 }, { "epoch": 0.16789087093389296, "grad_norm": 0.6111699342727661, "learning_rate": 7.106519965765934e-05, "loss": 0.1717, "mean_token_accuracy": 0.9499832153320312, "num_tokens": 9363710.0, "step": 160 }, { "epoch": 0.1731374606505771, "grad_norm": 0.7169287204742432, "learning_rate": 7.086732167522687e-05, "loss": 0.2031, "mean_token_accuracy": 0.9435471892356873, "num_tokens": 9650095.0, "step": 165 }, { "epoch": 0.1783840503672613, "grad_norm": 0.41854023933410645, "learning_rate": 7.066214656668653e-05, "loss": 0.128, "mean_token_accuracy": 0.9642794132232666, "num_tokens": 9943925.0, "step": 170 }, { "epoch": 0.18363064008394545, "grad_norm": 0.5462023019790649, "learning_rate": 7.044973440353702e-05, "loss": 0.1692, "mean_token_accuracy": 0.9510787844657898, "num_tokens": 10239935.0, "step": 175 }, { "epoch": 0.1888772298006296, "grad_norm": 0.682062566280365, "learning_rate": 7.023014737615354e-05, "loss": 0.0925, "mean_token_accuracy": 0.9718472838401795, "num_tokens": 10542283.0, "step": 180 }, { "epoch": 0.19412381951731375, "grad_norm": 0.4446072578430176, "learning_rate": 7.000344977557959e-05, "loss": 0.1831, "mean_token_accuracy": 0.9482220768928528, "num_tokens": 10833066.0, "step": 185 }, { "epoch": 0.1993704092339979, "grad_norm": 0.35908645391464233, "learning_rate": 6.976970797470374e-05, "loss": 0.1553, "mean_token_accuracy": 0.9547260999679565, "num_tokens": 11138524.0, "step": 190 }, { "epoch": 0.20461699895068206, "grad_norm": 0.5753610134124756, "learning_rate": 6.95289904088269e-05, "loss": 0.1783, "mean_token_accuracy": 0.9493900299072265, "num_tokens": 11434453.0, "step": 195 }, { "epoch": 0.2098635886673662, "grad_norm": 0.5665861964225769, "learning_rate": 6.928136755562561e-05, "loss": 0.1653, "mean_token_accuracy": 0.9550609827041626, "num_tokens": 11730328.0, "step": 200 }, { "epoch": 0.21511017838405036, "grad_norm": 0.5666924118995667, "learning_rate": 6.902691191451765e-05, "loss": 0.1927, "mean_token_accuracy": 0.9518372297286988, "num_tokens": 12015188.0, "step": 205 }, { "epoch": 0.2203567681007345, "grad_norm": 0.41499361395835876, "learning_rate": 6.876569798543542e-05, "loss": 0.1149, "mean_token_accuracy": 0.9646952271461486, "num_tokens": 12310232.0, "step": 210 }, { "epoch": 0.22560335781741866, "grad_norm": 0.44563302397727966, "learning_rate": 6.84978022470137e-05, "loss": 0.1195, "mean_token_accuracy": 0.9647434711456299, "num_tokens": 12591060.0, "step": 215 }, { "epoch": 0.23084994753410285, "grad_norm": 0.9801908731460571, "learning_rate": 6.822330313419822e-05, "loss": 0.1501, "mean_token_accuracy": 0.9570110678672791, "num_tokens": 12879679.0, "step": 220 }, { "epoch": 0.236096537250787, "grad_norm": 0.590543806552887, "learning_rate": 6.79422810152813e-05, "loss": 0.1879, "mean_token_accuracy": 0.949954628944397, "num_tokens": 13169419.0, "step": 225 }, { "epoch": 0.24134312696747115, "grad_norm": 0.5752514004707336, "learning_rate": 6.765481816837146e-05, "loss": 0.1507, "mean_token_accuracy": 0.9556933283805847, "num_tokens": 13471416.0, "step": 230 }, { "epoch": 0.2465897166841553, "grad_norm": 0.47486424446105957, "learning_rate": 6.736099875730397e-05, "loss": 0.1675, "mean_token_accuracy": 0.956990647315979, "num_tokens": 13767085.0, "step": 235 }, { "epoch": 0.2518363064008394, "grad_norm": 0.7542303800582886, "learning_rate": 6.706090880699916e-05, "loss": 0.1798, "mean_token_accuracy": 0.9498309254646301, "num_tokens": 14071045.0, "step": 240 }, { "epoch": 0.2570828961175236, "grad_norm": 0.5474414825439453, "learning_rate": 6.675463617827598e-05, "loss": 0.1698, "mean_token_accuracy": 0.9535765051841736, "num_tokens": 14362641.0, "step": 245 }, { "epoch": 0.2623294858342078, "grad_norm": 0.5496485829353333, "learning_rate": 6.644227054212787e-05, "loss": 0.1457, "mean_token_accuracy": 0.9581978678703308, "num_tokens": 14664981.0, "step": 250 }, { "epoch": 0.2675760755508919, "grad_norm": 0.49473029375076294, "learning_rate": 6.612390335346883e-05, "loss": 0.155, "mean_token_accuracy": 0.9565532088279725, "num_tokens": 14959011.0, "step": 255 }, { "epoch": 0.2728226652675761, "grad_norm": 0.303487628698349, "learning_rate": 6.579962782435706e-05, "loss": 0.1509, "mean_token_accuracy": 0.9600463390350342, "num_tokens": 15245374.0, "step": 260 }, { "epoch": 0.2780692549842602, "grad_norm": 0.5282312631607056, "learning_rate": 6.546953889670418e-05, "loss": 0.1702, "mean_token_accuracy": 0.9531951785087586, "num_tokens": 15525453.0, "step": 265 }, { "epoch": 0.2833158447009444, "grad_norm": 0.28557926416397095, "learning_rate": 6.513373321447803e-05, "loss": 0.1937, "mean_token_accuracy": 0.9494660019874572, "num_tokens": 15809199.0, "step": 270 }, { "epoch": 0.2885624344176285, "grad_norm": 0.4175744652748108, "learning_rate": 6.479230909540701e-05, "loss": 0.125, "mean_token_accuracy": 0.9632657885551452, "num_tokens": 16101826.0, "step": 275 }, { "epoch": 0.2938090241343127, "grad_norm": 0.46477630734443665, "learning_rate": 6.444536650219457e-05, "loss": 0.1477, "mean_token_accuracy": 0.9573351144790649, "num_tokens": 16390495.0, "step": 280 }, { "epoch": 0.2990556138509968, "grad_norm": 0.40762028098106384, "learning_rate": 6.409300701325186e-05, "loss": 0.1447, "mean_token_accuracy": 0.9613166928291321, "num_tokens": 16685503.0, "step": 285 }, { "epoch": 0.304302203567681, "grad_norm": 0.24077750742435455, "learning_rate": 6.37353337929575e-05, "loss": 0.172, "mean_token_accuracy": 0.9532236933708191, "num_tokens": 16964030.0, "step": 290 }, { "epoch": 0.3095487932843652, "grad_norm": 0.826016366481781, "learning_rate": 6.337245156145292e-05, "loss": 0.1803, "mean_token_accuracy": 0.955608320236206, "num_tokens": 17262831.0, "step": 295 }, { "epoch": 0.3147953830010493, "grad_norm": 0.4392148554325104, "learning_rate": 6.300446656398228e-05, "loss": 0.1769, "mean_token_accuracy": 0.950160825252533, "num_tokens": 17542500.0, "step": 300 }, { "epoch": 0.3200419727177335, "grad_norm": 0.5865193009376526, "learning_rate": 6.263148653978572e-05, "loss": 0.1245, "mean_token_accuracy": 0.9670758128166199, "num_tokens": 17833796.0, "step": 305 }, { "epoch": 0.3252885624344176, "grad_norm": 0.5191839337348938, "learning_rate": 6.225362069055539e-05, "loss": 0.192, "mean_token_accuracy": 0.947128975391388, "num_tokens": 18131801.0, "step": 310 }, { "epoch": 0.3305351521511018, "grad_norm": 0.41704511642456055, "learning_rate": 6.187097964846317e-05, "loss": 0.1353, "mean_token_accuracy": 0.9619265556335449, "num_tokens": 18429696.0, "step": 315 }, { "epoch": 0.3357817418677859, "grad_norm": 0.5766462087631226, "learning_rate": 6.148367544376953e-05, "loss": 0.1647, "mean_token_accuracy": 0.9554041147232055, "num_tokens": 18726485.0, "step": 320 }, { "epoch": 0.3410283315844701, "grad_norm": 0.4433589577674866, "learning_rate": 6.109182147202329e-05, "loss": 0.1643, "mean_token_accuracy": 0.9574645280838012, "num_tokens": 19017396.0, "step": 325 }, { "epoch": 0.3462749213011542, "grad_norm": 0.4165564179420471, "learning_rate": 6.069553246086131e-05, "loss": 0.1669, "mean_token_accuracy": 0.9528910636901855, "num_tokens": 19320603.0, "step": 330 }, { "epoch": 0.3515215110178384, "grad_norm": 0.6303921341896057, "learning_rate": 6.0294924436418496e-05, "loss": 0.1796, "mean_token_accuracy": 0.9498880624771118, "num_tokens": 19599038.0, "step": 335 } ], "logging_steps": 5, "max_steps": 953, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.871895722016768e+17, "train_batch_size": 24, "trial_name": null, "trial_params": null }