{ "best_global_step": 7359, "best_metric": 0.8462, "best_model_checkpoint": "models/NED/EMEA_human_only_tfidf_hybrid_long_v2_addheaders/Llama-3.1-8B-Instruct/checkpoint-7359", "epoch": 50.0, "eval_steps": 500, "global_step": 122650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1526817805758311, "epoch": 1.0, "grad_norm": 304.0, "learning_rate": 1.9989130434782608e-05, "loss": 0.7669, "mean_token_accuracy": 0.8752253057546777, "num_tokens": 15010779.0, "step": 2453 }, { "epoch": 1.0, "eval_entropy": 1.2358426589232225, "eval_loss": 0.6339517831802368, "eval_mean_token_accuracy": 0.8988095246828519, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 15010779.0, "eval_recall": 0.7308, "eval_runtime": 3.6399, "eval_samples_per_second": 7.143, "eval_steps_per_second": 3.571, "step": 2453 }, { "entropy": 1.3605892632720036, "epoch": 2.0, "grad_norm": 12.1875, "learning_rate": 2.9691098596284776e-05, "loss": 0.5437, "mean_token_accuracy": 0.9150349811612466, "num_tokens": 30021558.0, "step": 4906 }, { "epoch": 2.0, "eval_entropy": 1.1509519540346587, "eval_loss": 0.4853871166706085, "eval_mean_token_accuracy": 0.9201437464127173, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 30021558.0, "eval_recall": 0.7692, "eval_runtime": 3.627, "eval_samples_per_second": 7.168, "eval_steps_per_second": 3.584, "step": 4906 }, { "entropy": 1.1862413553719222, "epoch": 3.0, "grad_norm": 2.1875, "learning_rate": 2.9072539295620746e-05, "loss": 0.2619, "mean_token_accuracy": 0.9548876376794495, "num_tokens": 45032337.0, "step": 7359 }, { "epoch": 3.0, "eval_entropy": 1.019592651954064, "eval_loss": 0.5770813822746277, "eval_mean_token_accuracy": 0.9220362993387076, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 45032337.0, "eval_recall": 0.8462, "eval_runtime": 3.6363, "eval_samples_per_second": 7.15, "eval_steps_per_second": 3.575, "step": 7359 }, { "entropy": 0.9634018300311497, "epoch": 4.0, "grad_norm": 0.1240234375, "learning_rate": 2.8453979994956713e-05, "loss": 0.1216, "mean_token_accuracy": 0.9782008502466845, "num_tokens": 60043116.0, "step": 9812 }, { "epoch": 4.0, "eval_entropy": 0.8699520321992728, "eval_loss": 0.5446107387542725, "eval_mean_token_accuracy": 0.940018314581651, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 60043116.0, "eval_recall": 0.8462, "eval_runtime": 3.6143, "eval_samples_per_second": 7.194, "eval_steps_per_second": 3.597, "step": 9812 }, { "entropy": 0.7849812429144681, "epoch": 5.0, "grad_norm": 0.002227783203125, "learning_rate": 2.783542069429268e-05, "loss": 0.0517, "mean_token_accuracy": 0.9894482943411997, "num_tokens": 75053895.0, "step": 12265 }, { "epoch": 5.0, "eval_entropy": 0.6801113898937519, "eval_loss": 0.7289856672286987, "eval_mean_token_accuracy": 0.9444444454633273, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 75053895.0, "eval_recall": 0.8462, "eval_runtime": 3.6486, "eval_samples_per_second": 7.126, "eval_steps_per_second": 3.563, "step": 12265 }, { "entropy": 0.6892432886826181, "epoch": 6.0, "grad_norm": 0.0004749298095703125, "learning_rate": 2.721686139362865e-05, "loss": 0.0209, "mean_token_accuracy": 0.9958273216359138, "num_tokens": 90064674.0, "step": 14718 }, { "epoch": 6.0, "eval_entropy": 0.577189931502709, "eval_loss": 0.7246649265289307, "eval_mean_token_accuracy": 0.9444444454633273, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 90064674.0, "eval_recall": 0.8462, "eval_runtime": 3.6456, "eval_samples_per_second": 7.132, "eval_steps_per_second": 3.566, "step": 14718 }, { "entropy": 0.6557439389371696, "epoch": 7.0, "grad_norm": 0.000888824462890625, "learning_rate": 2.659830209296461e-05, "loss": 0.0078, "mean_token_accuracy": 0.9979321826393635, "num_tokens": 105075453.0, "step": 17171 }, { "epoch": 7.0, "eval_entropy": 0.5603500146132249, "eval_loss": 0.8045116662979126, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 105075453.0, "eval_recall": 0.8462, "eval_runtime": 5.557, "eval_samples_per_second": 4.679, "eval_steps_per_second": 2.339, "step": 17171 }, { "entropy": 0.6481096161976669, "epoch": 8.0, "grad_norm": 8.96453857421875e-05, "learning_rate": 2.597974279230058e-05, "loss": 0.0028, "mean_token_accuracy": 0.9993061645391568, "num_tokens": 120086232.0, "step": 19624 }, { "epoch": 8.0, "eval_entropy": 0.5650725089586698, "eval_loss": 0.8335245847702026, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 120086232.0, "eval_recall": 0.8462, "eval_runtime": 3.6391, "eval_samples_per_second": 7.145, "eval_steps_per_second": 3.572, "step": 19624 }, { "entropy": 0.6384989822756452, "epoch": 9.0, "grad_norm": 0.00102996826171875, "learning_rate": 2.5361183491636548e-05, "loss": 0.0011, "mean_token_accuracy": 0.9997574686275129, "num_tokens": 135097011.0, "step": 22077 }, { "epoch": 9.0, "eval_entropy": 0.5437194108963013, "eval_loss": 0.8720409870147705, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 135097011.0, "eval_recall": 0.8462, "eval_runtime": 3.6696, "eval_samples_per_second": 7.085, "eval_steps_per_second": 3.543, "step": 22077 }, { "entropy": 0.6327040182586792, "epoch": 10.0, "grad_norm": 0.00011968612670898438, "learning_rate": 2.4742624190972517e-05, "loss": 0.0002, "mean_token_accuracy": 0.9999592335818012, "num_tokens": 150107790.0, "step": 24530 }, { "epoch": 10.0, "eval_entropy": 0.5456434029799241, "eval_loss": 0.8786986470222473, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 150107790.0, "eval_recall": 0.8462, "eval_runtime": 3.7213, "eval_samples_per_second": 6.987, "eval_steps_per_second": 3.493, "step": 24530 }, { "entropy": 0.6342776355527636, "epoch": 11.0, "grad_norm": 2.9206275939941406e-05, "learning_rate": 2.412406489030848e-05, "loss": 0.0001, "mean_token_accuracy": 0.9999629396397, "num_tokens": 165118569.0, "step": 26983 }, { "epoch": 11.0, "eval_entropy": 0.5441241906239436, "eval_loss": 0.8776129484176636, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 165118569.0, "eval_recall": 0.8462, "eval_runtime": 3.6285, "eval_samples_per_second": 7.165, "eval_steps_per_second": 3.583, "step": 26983 }, { "entropy": 0.6330991076222742, "epoch": 12.0, "grad_norm": 0.000823974609375, "learning_rate": 2.350550558964445e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 180129348.0, "step": 29436 }, { "epoch": 12.0, "eval_entropy": 0.544509245799138, "eval_loss": 0.88084477186203, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 180129348.0, "eval_recall": 0.8462, "eval_runtime": 3.6661, "eval_samples_per_second": 7.092, "eval_steps_per_second": 3.546, "step": 29436 }, { "entropy": 0.6322705759061291, "epoch": 13.0, "grad_norm": 0.010498046875, "learning_rate": 2.2886946288980416e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 195140127.0, "step": 31889 }, { "epoch": 13.0, "eval_entropy": 0.5434356606923617, "eval_loss": 0.8842343091964722, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 195140127.0, "eval_recall": 0.8462, "eval_runtime": 4.1268, "eval_samples_per_second": 6.3, "eval_steps_per_second": 3.15, "step": 31889 }, { "entropy": 0.6316640121908612, "epoch": 14.0, "grad_norm": 0.0035552978515625, "learning_rate": 2.2268386988316383e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 210150906.0, "step": 34342 }, { "epoch": 14.0, "eval_entropy": 0.543243577847114, "eval_loss": 0.885927140712738, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 210150906.0, "eval_recall": 0.8462, "eval_runtime": 3.7188, "eval_samples_per_second": 6.991, "eval_steps_per_second": 3.496, "step": 34342 }, { "entropy": 0.6321596540070241, "epoch": 15.0, "grad_norm": 2.4199485778808594e-05, "learning_rate": 2.164982768765235e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 225161685.0, "step": 36795 }, { "epoch": 15.0, "eval_entropy": 0.5422769280580374, "eval_loss": 0.8823052644729614, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 225161685.0, "eval_recall": 0.8462, "eval_runtime": 3.6723, "eval_samples_per_second": 7.08, "eval_steps_per_second": 3.54, "step": 36795 }, { "entropy": 0.6315903761194426, "epoch": 16.0, "grad_norm": 0.0291748046875, "learning_rate": 2.1031268386988316e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 240172464.0, "step": 39248 }, { "epoch": 16.0, "eval_entropy": 0.5426660546889672, "eval_loss": 0.8869765996932983, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 240172464.0, "eval_recall": 0.8462, "eval_runtime": 3.6896, "eval_samples_per_second": 7.047, "eval_steps_per_second": 3.523, "step": 39248 }, { "entropy": 0.6317922561279472, "epoch": 17.0, "grad_norm": 0.0001850128173828125, "learning_rate": 2.0412709086324285e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 255183243.0, "step": 41701 }, { "epoch": 17.0, "eval_entropy": 0.542809899036701, "eval_loss": 0.8864607214927673, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 255183243.0, "eval_recall": 0.8462, "eval_runtime": 3.6498, "eval_samples_per_second": 7.124, "eval_steps_per_second": 3.562, "step": 41701 }, { "entropy": 0.6319634849034763, "epoch": 18.0, "grad_norm": 2.1457672119140625e-05, "learning_rate": 1.979414978566025e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 270194022.0, "step": 44154 }, { "epoch": 18.0, "eval_entropy": 0.5426488243616544, "eval_loss": 0.8861849308013916, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 270194022.0, "eval_recall": 0.8462, "eval_runtime": 3.6568, "eval_samples_per_second": 7.11, "eval_steps_per_second": 3.555, "step": 44154 }, { "entropy": 0.631338802688325, "epoch": 19.0, "grad_norm": 4.076957702636719e-05, "learning_rate": 1.9175590484996218e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 285204801.0, "step": 46607 }, { "epoch": 19.0, "eval_entropy": 0.5423762339812058, "eval_loss": 0.885791540145874, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 285204801.0, "eval_recall": 0.8462, "eval_runtime": 3.653, "eval_samples_per_second": 7.118, "eval_steps_per_second": 3.559, "step": 46607 }, { "entropy": 0.6311312203036976, "epoch": 20.0, "grad_norm": 0.0004634857177734375, "learning_rate": 1.8557031184332184e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 300215580.0, "step": 49060 }, { "epoch": 20.0, "eval_entropy": 0.5424229686076825, "eval_loss": 0.8889456987380981, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 300215580.0, "eval_recall": 0.8462, "eval_runtime": 3.651, "eval_samples_per_second": 7.121, "eval_steps_per_second": 3.561, "step": 49060 }, { "entropy": 0.631198678741249, "epoch": 21.0, "grad_norm": 0.00031280517578125, "learning_rate": 1.793847188366815e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 315226359.0, "step": 51513 }, { "epoch": 21.0, "eval_entropy": 0.5428222968028142, "eval_loss": 0.8843169808387756, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 315226359.0, "eval_recall": 0.8462, "eval_runtime": 3.6619, "eval_samples_per_second": 7.1, "eval_steps_per_second": 3.55, "step": 51513 }, { "entropy": 0.6313406728478388, "epoch": 22.0, "grad_norm": 0.000759124755859375, "learning_rate": 1.731991258300412e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 330237138.0, "step": 53966 }, { "epoch": 22.0, "eval_entropy": 0.5427144765853882, "eval_loss": 0.8861469030380249, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 330237138.0, "eval_recall": 0.8462, "eval_runtime": 3.6544, "eval_samples_per_second": 7.115, "eval_steps_per_second": 3.557, "step": 53966 }, { "entropy": 0.6313331465647263, "epoch": 23.0, "grad_norm": 0.00051116943359375, "learning_rate": 1.6701353282340083e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 345247917.0, "step": 56419 }, { "epoch": 23.0, "eval_entropy": 0.5423137545585632, "eval_loss": 0.8892049193382263, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 345247917.0, "eval_recall": 0.8462, "eval_runtime": 3.6537, "eval_samples_per_second": 7.116, "eval_steps_per_second": 3.558, "step": 56419 }, { "entropy": 0.6310314053401527, "epoch": 24.0, "grad_norm": 3.600120544433594e-05, "learning_rate": 1.6082793981676053e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 360258696.0, "step": 58872 }, { "epoch": 24.0, "eval_entropy": 0.5423843631377587, "eval_loss": 0.8886714577674866, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 360258696.0, "eval_recall": 0.8462, "eval_runtime": 3.6316, "eval_samples_per_second": 7.159, "eval_steps_per_second": 3.58, "step": 58872 }, { "entropy": 0.6315073234496484, "epoch": 25.0, "grad_norm": 7.82012939453125e-05, "learning_rate": 1.546423468101202e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 375269475.0, "step": 61325 }, { "epoch": 25.0, "eval_entropy": 0.5420686419193561, "eval_loss": 0.8865240812301636, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 375269475.0, "eval_recall": 0.8462, "eval_runtime": 3.613, "eval_samples_per_second": 7.196, "eval_steps_per_second": 3.598, "step": 61325 }, { "entropy": 0.632054461467718, "epoch": 26.0, "grad_norm": 0.00024318695068359375, "learning_rate": 1.4845675380347987e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 15010779.0, "step": 63778 }, { "epoch": 26.0, "eval_entropy": 0.5426568893285898, "eval_loss": 0.88667893409729, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 15010779.0, "eval_recall": 0.8462, "eval_runtime": 3.647, "eval_samples_per_second": 7.129, "eval_steps_per_second": 3.565, "step": 63778 }, { "entropy": 0.6314872418356777, "epoch": 27.0, "grad_norm": 0.00011396408081054688, "learning_rate": 1.4227116079683954e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 30021558.0, "step": 66231 }, { "epoch": 27.0, "eval_entropy": 0.5423887417866633, "eval_loss": 0.8907365798950195, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 30021558.0, "eval_recall": 0.8462, "eval_runtime": 3.6242, "eval_samples_per_second": 7.174, "eval_steps_per_second": 3.587, "step": 66231 }, { "entropy": 0.6317801613055392, "epoch": 28.0, "grad_norm": 8.392333984375e-05, "learning_rate": 1.3608556779019922e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 45032337.0, "step": 68684 }, { "epoch": 28.0, "eval_entropy": 0.5428364735383254, "eval_loss": 0.885719358921051, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 45032337.0, "eval_recall": 0.8462, "eval_runtime": 3.6828, "eval_samples_per_second": 7.06, "eval_steps_per_second": 3.53, "step": 68684 }, { "entropy": 0.6310389586555389, "epoch": 29.0, "grad_norm": 0.000774383544921875, "learning_rate": 1.2989997478355888e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 60043116.0, "step": 71137 }, { "epoch": 29.0, "eval_entropy": 0.5424722524789664, "eval_loss": 0.8864960074424744, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 60043116.0, "eval_recall": 0.8462, "eval_runtime": 3.6359, "eval_samples_per_second": 7.151, "eval_steps_per_second": 3.576, "step": 71137 }, { "entropy": 0.6310345640461444, "epoch": 30.0, "grad_norm": 3.5762786865234375e-05, "learning_rate": 1.2371438177691856e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 75053895.0, "step": 73590 }, { "epoch": 30.0, "eval_entropy": 0.5427528161268967, "eval_loss": 0.8871183395385742, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 75053895.0, "eval_recall": 0.8462, "eval_runtime": 3.6648, "eval_samples_per_second": 7.095, "eval_steps_per_second": 3.547, "step": 73590 }, { "entropy": 0.6307261824680745, "epoch": 31.0, "grad_norm": 0.00015163421630859375, "learning_rate": 1.1752878877027823e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 90064674.0, "step": 76043 }, { "epoch": 31.0, "eval_entropy": 0.5423439878683823, "eval_loss": 0.890313982963562, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 90064674.0, "eval_recall": 0.8462, "eval_runtime": 3.6589, "eval_samples_per_second": 7.106, "eval_steps_per_second": 3.553, "step": 76043 }, { "entropy": 0.6317850742056279, "epoch": 32.0, "grad_norm": 0.0005035400390625, "learning_rate": 1.113431957636379e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 105075453.0, "step": 78496 }, { "epoch": 32.0, "eval_entropy": 0.5422184283916767, "eval_loss": 0.8882402181625366, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 105075453.0, "eval_recall": 0.8462, "eval_runtime": 3.6075, "eval_samples_per_second": 7.207, "eval_steps_per_second": 3.604, "step": 78496 }, { "entropy": 0.6315069926961121, "epoch": 33.0, "grad_norm": 0.0079345703125, "learning_rate": 1.0515760275699757e-05, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 120086232.0, "step": 80949 }, { "epoch": 33.0, "eval_entropy": 0.5428683024186355, "eval_loss": 0.8859032988548279, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 120086232.0, "eval_recall": 0.8462, "eval_runtime": 3.6537, "eval_samples_per_second": 7.116, "eval_steps_per_second": 3.558, "step": 80949 }, { "entropy": 0.6313212784246381, "epoch": 34.0, "grad_norm": 0.000885009765625, "learning_rate": 9.897200975035723e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 135097011.0, "step": 83402 }, { "epoch": 34.0, "eval_entropy": 0.5425068598527175, "eval_loss": 0.887780487537384, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 135097011.0, "eval_recall": 0.8462, "eval_runtime": 3.6448, "eval_samples_per_second": 7.133, "eval_steps_per_second": 3.567, "step": 83402 }, { "entropy": 0.6308202771352254, "epoch": 35.0, "grad_norm": 0.00032806396484375, "learning_rate": 9.27864167437169e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 150107790.0, "step": 85855 }, { "epoch": 35.0, "eval_entropy": 0.54246619114509, "eval_loss": 0.8900800347328186, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 150107790.0, "eval_recall": 0.8462, "eval_runtime": 3.6253, "eval_samples_per_second": 7.172, "eval_steps_per_second": 3.586, "step": 85855 }, { "entropy": 0.6310893858737767, "epoch": 36.0, "grad_norm": 0.00543212890625, "learning_rate": 8.660082373707658e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 165118569.0, "step": 88308 }, { "epoch": 36.0, "eval_entropy": 0.542354785479032, "eval_loss": 0.882867157459259, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 165118569.0, "eval_recall": 0.8462, "eval_runtime": 3.6309, "eval_samples_per_second": 7.161, "eval_steps_per_second": 3.58, "step": 88308 }, { "entropy": 0.6313383878492308, "epoch": 37.0, "grad_norm": 0.0014495849609375, "learning_rate": 8.041523073043624e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 180129348.0, "step": 90761 }, { "epoch": 37.0, "eval_entropy": 0.5429406670423654, "eval_loss": 0.8894430994987488, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 180129348.0, "eval_recall": 0.8462, "eval_runtime": 3.6047, "eval_samples_per_second": 7.213, "eval_steps_per_second": 3.606, "step": 90761 }, { "entropy": 0.6315074832012738, "epoch": 38.0, "grad_norm": 1.8477439880371094e-05, "learning_rate": 7.422963772379592e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 195140127.0, "step": 93214 }, { "epoch": 38.0, "eval_entropy": 0.5428708929281968, "eval_loss": 0.8853751420974731, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 195140127.0, "eval_recall": 0.8462, "eval_runtime": 3.6095, "eval_samples_per_second": 7.203, "eval_steps_per_second": 3.602, "step": 93214 }, { "entropy": 0.6316086658156264, "epoch": 39.0, "grad_norm": 0.0019378662109375, "learning_rate": 6.804404471715559e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 210150906.0, "step": 95667 }, { "epoch": 39.0, "eval_entropy": 0.5423155472828791, "eval_loss": 0.8865050673484802, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 210150906.0, "eval_recall": 0.8462, "eval_runtime": 3.6105, "eval_samples_per_second": 7.201, "eval_steps_per_second": 3.601, "step": 95667 }, { "entropy": 0.6319762418161253, "epoch": 40.0, "grad_norm": 0.0076904296875, "learning_rate": 6.185845171051526e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 225161685.0, "step": 98120 }, { "epoch": 40.0, "eval_entropy": 0.5423448315033546, "eval_loss": 0.887237012386322, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 225161685.0, "eval_recall": 0.8462, "eval_runtime": 3.6062, "eval_samples_per_second": 7.21, "eval_steps_per_second": 3.605, "step": 98120 }, { "entropy": 0.6316094772090632, "epoch": 41.0, "grad_norm": 0.00040435791015625, "learning_rate": 5.567285870387493e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 240172464.0, "step": 100573 }, { "epoch": 41.0, "eval_entropy": 0.5424330555475675, "eval_loss": 0.8862788081169128, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 240172464.0, "eval_recall": 0.8462, "eval_runtime": 3.6042, "eval_samples_per_second": 7.214, "eval_steps_per_second": 3.607, "step": 100573 }, { "entropy": 0.6310035889118581, "epoch": 42.0, "grad_norm": 0.0020294189453125, "learning_rate": 4.94872656972346e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 255183243.0, "step": 103026 }, { "epoch": 42.0, "eval_entropy": 0.5431472292313209, "eval_loss": 0.890018105506897, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 255183243.0, "eval_recall": 0.8462, "eval_runtime": 3.6041, "eval_samples_per_second": 7.214, "eval_steps_per_second": 3.607, "step": 103026 }, { "entropy": 0.6312229550229838, "epoch": 43.0, "grad_norm": 0.0012969970703125, "learning_rate": 4.330167269059427e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 270194022.0, "step": 105479 }, { "epoch": 43.0, "eval_entropy": 0.5424636235603919, "eval_loss": 0.8868480324745178, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 270194022.0, "eval_recall": 0.8462, "eval_runtime": 3.606, "eval_samples_per_second": 7.21, "eval_steps_per_second": 3.605, "step": 105479 }, { "entropy": 0.631434175660063, "epoch": 44.0, "grad_norm": 7.390975952148438e-05, "learning_rate": 3.711607968395394e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 285204801.0, "step": 107932 }, { "epoch": 44.0, "eval_entropy": 0.5421680899766775, "eval_loss": 0.8860384821891785, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 285204801.0, "eval_recall": 0.8462, "eval_runtime": 3.6344, "eval_samples_per_second": 7.154, "eval_steps_per_second": 3.577, "step": 107932 }, { "entropy": 0.6307510763127319, "epoch": 45.0, "grad_norm": 0.00927734375, "learning_rate": 3.0930486677313608e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 300215580.0, "step": 110385 }, { "epoch": 45.0, "eval_entropy": 0.54229736328125, "eval_loss": 0.8853968977928162, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 300215580.0, "eval_recall": 0.8462, "eval_runtime": 3.61, "eval_samples_per_second": 7.202, "eval_steps_per_second": 3.601, "step": 110385 }, { "entropy": 0.6315490893937595, "epoch": 46.0, "grad_norm": 0.0001239776611328125, "learning_rate": 2.474489367067328e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 315226359.0, "step": 112838 }, { "epoch": 46.0, "eval_entropy": 0.5422170620698196, "eval_loss": 0.8882192373275757, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 315226359.0, "eval_recall": 0.8462, "eval_runtime": 3.7084, "eval_samples_per_second": 7.011, "eval_steps_per_second": 3.506, "step": 112838 }, { "entropy": 0.6317317981380761, "epoch": 47.0, "grad_norm": 3.3855438232421875e-05, "learning_rate": 1.855930066403295e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 330237138.0, "step": 115291 }, { "epoch": 47.0, "eval_entropy": 0.5427549022894639, "eval_loss": 0.8879793882369995, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 330237138.0, "eval_recall": 0.8462, "eval_runtime": 3.6923, "eval_samples_per_second": 7.042, "eval_steps_per_second": 3.521, "step": 115291 }, { "entropy": 0.6314135375092869, "epoch": 48.0, "grad_norm": 0.0025634765625, "learning_rate": 1.2373707657392621e-06, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 345247917.0, "step": 117744 }, { "epoch": 48.0, "eval_entropy": 0.5423269546948947, "eval_loss": 0.887828528881073, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 345247917.0, "eval_recall": 0.8462, "eval_runtime": 3.661, "eval_samples_per_second": 7.102, "eval_steps_per_second": 3.551, "step": 117744 }, { "entropy": 0.6317788491650499, "epoch": 49.0, "grad_norm": 0.0015106201171875, "learning_rate": 6.18811465075229e-07, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 360258696.0, "step": 120197 }, { "epoch": 49.0, "eval_entropy": 0.5421000031324533, "eval_loss": 0.886226236820221, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 360258696.0, "eval_recall": 0.8462, "eval_runtime": 3.8724, "eval_samples_per_second": 6.714, "eval_steps_per_second": 3.357, "step": 120197 }, { "entropy": 0.6307675256881722, "epoch": 50.0, "grad_norm": 0.0003414154052734375, "learning_rate": 2.5216441119609984e-10, "loss": 0.0, "mean_token_accuracy": 1.0, "num_tokens": 375269475.0, "step": 122650 }, { "epoch": 50.0, "eval_entropy": 0.5427401478473957, "eval_loss": 0.888108491897583, "eval_mean_token_accuracy": 0.9358974374257601, "eval_num_gold": 26, "eval_num_guess": 26, "eval_num_tokens": 375269475.0, "eval_recall": 0.8462, "eval_runtime": 3.7116, "eval_samples_per_second": 7.005, "eval_steps_per_second": 3.503, "step": 122650 } ], "logging_steps": 0, "max_steps": 122650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3796448253168845e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }