{ "best_global_step": 1629, "best_metric": 1.2086403369903564, "best_model_checkpoint": "./outputs_lora_32_16_1_3_dataset_no_system_completion_only/checkpoint-1629", "epoch": 3.0, "eval_steps": 500, "global_step": 1629, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018424689083371719, "grad_norm": 0.27174142003059387, "learning_rate": 0.0005, "loss": 1.9274, "num_tokens": 62917.0, "step": 1 }, { "epoch": 0.0036849378166743437, "grad_norm": 0.8499718308448792, "learning_rate": 0.0004998465316144874, "loss": 1.8094, "num_tokens": 126861.0, "step": 2 }, { "epoch": 0.0055274067250115156, "grad_norm": 0.4138409495353699, "learning_rate": 0.0004996930632289748, "loss": 1.8183, "num_tokens": 189316.0, "step": 3 }, { "epoch": 0.007369875633348687, "grad_norm": 0.2764137089252472, "learning_rate": 0.0004995395948434623, "loss": 1.8011, "num_tokens": 251192.0, "step": 4 }, { "epoch": 0.009212344541685858, "grad_norm": 0.26355600357055664, "learning_rate": 0.0004993861264579497, "loss": 1.6307, "num_tokens": 314479.0, "step": 5 }, { "epoch": 0.011054813450023031, "grad_norm": 0.17187239229679108, "learning_rate": 0.0004992326580724371, "loss": 1.6604, "num_tokens": 377017.0, "step": 6 }, { "epoch": 0.012897282358360202, "grad_norm": 0.2717991769313812, "learning_rate": 0.0004990791896869245, "loss": 1.5837, "num_tokens": 440391.0, "step": 7 }, { "epoch": 0.014739751266697375, "grad_norm": 0.14616522192955017, "learning_rate": 0.0004989257213014119, "loss": 1.552, "num_tokens": 503225.0, "step": 8 }, { "epoch": 0.016582220175034548, "grad_norm": 0.1655392199754715, "learning_rate": 0.0004987722529158994, "loss": 1.5354, "num_tokens": 567263.0, "step": 9 }, { "epoch": 0.018424689083371717, "grad_norm": 0.12371820956468582, "learning_rate": 0.0004986187845303868, "loss": 1.5507, "num_tokens": 630615.0, "step": 10 }, { "epoch": 0.02026715799170889, "grad_norm": 0.16554005444049835, "learning_rate": 0.0004984653161448742, "loss": 1.5678, "num_tokens": 694637.0, "step": 11 }, { "epoch": 0.022109626900046062, "grad_norm": 0.14094720780849457, "learning_rate": 0.0004983118477593616, "loss": 1.4875, "num_tokens": 759480.0, "step": 12 }, { "epoch": 0.023952095808383235, "grad_norm": 0.09484918415546417, "learning_rate": 0.000498158379373849, "loss": 1.4624, "num_tokens": 822449.0, "step": 13 }, { "epoch": 0.025794564716720404, "grad_norm": 0.09723198413848877, "learning_rate": 0.0004980049109883364, "loss": 1.5179, "num_tokens": 886191.0, "step": 14 }, { "epoch": 0.027637033625057577, "grad_norm": 0.11498954892158508, "learning_rate": 0.0004978514426028239, "loss": 1.4651, "num_tokens": 948485.0, "step": 15 }, { "epoch": 0.02947950253339475, "grad_norm": 0.11234007030725479, "learning_rate": 0.0004976979742173113, "loss": 1.5417, "num_tokens": 1012179.0, "step": 16 }, { "epoch": 0.03132197144173192, "grad_norm": 0.10377254337072372, "learning_rate": 0.0004975445058317987, "loss": 1.4657, "num_tokens": 1074382.0, "step": 17 }, { "epoch": 0.033164440350069095, "grad_norm": 0.11212120205163956, "learning_rate": 0.000497391037446286, "loss": 1.4219, "num_tokens": 1137980.0, "step": 18 }, { "epoch": 0.035006909258406264, "grad_norm": 0.1026037186384201, "learning_rate": 0.0004972375690607735, "loss": 1.5026, "num_tokens": 1200597.0, "step": 19 }, { "epoch": 0.036849378166743434, "grad_norm": 0.10714147239923477, "learning_rate": 0.0004970841006752608, "loss": 1.4428, "num_tokens": 1263491.0, "step": 20 }, { "epoch": 0.03869184707508061, "grad_norm": 0.09927812963724136, "learning_rate": 0.0004969306322897484, "loss": 1.4356, "num_tokens": 1326089.0, "step": 21 }, { "epoch": 0.04053431598341778, "grad_norm": 0.08096501976251602, "learning_rate": 0.0004967771639042357, "loss": 1.3963, "num_tokens": 1390383.0, "step": 22 }, { "epoch": 0.04237678489175495, "grad_norm": 0.09743349254131317, "learning_rate": 0.0004966236955187232, "loss": 1.4791, "num_tokens": 1453769.0, "step": 23 }, { "epoch": 0.044219253800092125, "grad_norm": 0.08892462402582169, "learning_rate": 0.0004964702271332105, "loss": 1.4465, "num_tokens": 1516712.0, "step": 24 }, { "epoch": 0.046061722708429294, "grad_norm": 0.09490308165550232, "learning_rate": 0.000496316758747698, "loss": 1.3586, "num_tokens": 1579592.0, "step": 25 }, { "epoch": 0.04790419161676647, "grad_norm": 0.09283996373414993, "learning_rate": 0.0004961632903621853, "loss": 1.4061, "num_tokens": 1643281.0, "step": 26 }, { "epoch": 0.04974666052510364, "grad_norm": 0.08532562106847763, "learning_rate": 0.0004960098219766729, "loss": 1.4151, "num_tokens": 1707335.0, "step": 27 }, { "epoch": 0.05158912943344081, "grad_norm": 0.09012343734502792, "learning_rate": 0.0004958563535911602, "loss": 1.426, "num_tokens": 1771901.0, "step": 28 }, { "epoch": 0.053431598341777985, "grad_norm": 0.10019658505916595, "learning_rate": 0.0004957028852056477, "loss": 1.3576, "num_tokens": 1835986.0, "step": 29 }, { "epoch": 0.055274067250115154, "grad_norm": 0.0924343466758728, "learning_rate": 0.000495549416820135, "loss": 1.4321, "num_tokens": 1898501.0, "step": 30 }, { "epoch": 0.05711653615845232, "grad_norm": 0.09559261798858643, "learning_rate": 0.0004953959484346225, "loss": 1.404, "num_tokens": 1962003.0, "step": 31 }, { "epoch": 0.0589590050667895, "grad_norm": 0.07674569636583328, "learning_rate": 0.0004952424800491099, "loss": 1.3344, "num_tokens": 2025724.0, "step": 32 }, { "epoch": 0.06080147397512667, "grad_norm": 0.08839581906795502, "learning_rate": 0.0004950890116635973, "loss": 1.3679, "num_tokens": 2088034.0, "step": 33 }, { "epoch": 0.06264394288346384, "grad_norm": 0.08382457494735718, "learning_rate": 0.0004949355432780847, "loss": 1.4207, "num_tokens": 2152118.0, "step": 34 }, { "epoch": 0.06448641179180101, "grad_norm": 0.09082429856061935, "learning_rate": 0.0004947820748925721, "loss": 1.3957, "num_tokens": 2215818.0, "step": 35 }, { "epoch": 0.06632888070013819, "grad_norm": 0.08498605340719223, "learning_rate": 0.0004946286065070595, "loss": 1.3381, "num_tokens": 2279493.0, "step": 36 }, { "epoch": 0.06817134960847536, "grad_norm": 0.07881935685873032, "learning_rate": 0.0004944751381215469, "loss": 1.4239, "num_tokens": 2343461.0, "step": 37 }, { "epoch": 0.07001381851681253, "grad_norm": 0.08435941487550735, "learning_rate": 0.0004943216697360344, "loss": 1.3223, "num_tokens": 2407495.0, "step": 38 }, { "epoch": 0.0718562874251497, "grad_norm": 0.08587496727705002, "learning_rate": 0.0004941682013505218, "loss": 1.4088, "num_tokens": 2470872.0, "step": 39 }, { "epoch": 0.07369875633348687, "grad_norm": 0.08659565448760986, "learning_rate": 0.0004940147329650092, "loss": 1.4272, "num_tokens": 2535174.0, "step": 40 }, { "epoch": 0.07554122524182405, "grad_norm": 0.08041954040527344, "learning_rate": 0.0004938612645794966, "loss": 1.4501, "num_tokens": 2597754.0, "step": 41 }, { "epoch": 0.07738369415016122, "grad_norm": 0.0823766365647316, "learning_rate": 0.000493707796193984, "loss": 1.3621, "num_tokens": 2661681.0, "step": 42 }, { "epoch": 0.07922616305849839, "grad_norm": 0.07833965867757797, "learning_rate": 0.0004935543278084714, "loss": 1.3573, "num_tokens": 2725549.0, "step": 43 }, { "epoch": 0.08106863196683556, "grad_norm": 0.08382697403430939, "learning_rate": 0.0004934008594229589, "loss": 1.4065, "num_tokens": 2788335.0, "step": 44 }, { "epoch": 0.08291110087517273, "grad_norm": 0.08919845521450043, "learning_rate": 0.0004932473910374463, "loss": 1.3416, "num_tokens": 2852294.0, "step": 45 }, { "epoch": 0.0847535697835099, "grad_norm": 0.08479095995426178, "learning_rate": 0.0004930939226519337, "loss": 1.294, "num_tokens": 2915169.0, "step": 46 }, { "epoch": 0.08659603869184708, "grad_norm": 0.09619110822677612, "learning_rate": 0.0004929404542664211, "loss": 1.3861, "num_tokens": 2978278.0, "step": 47 }, { "epoch": 0.08843850760018425, "grad_norm": 0.10685691237449646, "learning_rate": 0.0004927869858809085, "loss": 1.3039, "num_tokens": 3042828.0, "step": 48 }, { "epoch": 0.09028097650852142, "grad_norm": 0.07755471020936966, "learning_rate": 0.000492633517495396, "loss": 1.313, "num_tokens": 3106706.0, "step": 49 }, { "epoch": 0.09212344541685859, "grad_norm": 0.09510131180286407, "learning_rate": 0.0004924800491098834, "loss": 1.4122, "num_tokens": 3169815.0, "step": 50 }, { "epoch": 0.09396591432519576, "grad_norm": 0.1044248566031456, "learning_rate": 0.0004923265807243708, "loss": 1.3516, "num_tokens": 3233598.0, "step": 51 }, { "epoch": 0.09580838323353294, "grad_norm": 0.08970752358436584, "learning_rate": 0.0004921731123388582, "loss": 1.3695, "num_tokens": 3296941.0, "step": 52 }, { "epoch": 0.09765085214187011, "grad_norm": 0.07939132302999496, "learning_rate": 0.0004920196439533456, "loss": 1.3661, "num_tokens": 3361034.0, "step": 53 }, { "epoch": 0.09949332105020728, "grad_norm": 0.09606830030679703, "learning_rate": 0.000491866175567833, "loss": 1.3824, "num_tokens": 3424573.0, "step": 54 }, { "epoch": 0.10133578995854445, "grad_norm": 0.08115610480308533, "learning_rate": 0.0004917127071823205, "loss": 1.3781, "num_tokens": 3489169.0, "step": 55 }, { "epoch": 0.10317825886688162, "grad_norm": 0.08662096410989761, "learning_rate": 0.0004915592387968079, "loss": 1.313, "num_tokens": 3551060.0, "step": 56 }, { "epoch": 0.1050207277752188, "grad_norm": 0.09329764544963837, "learning_rate": 0.0004914057704112953, "loss": 1.3739, "num_tokens": 3615242.0, "step": 57 }, { "epoch": 0.10686319668355597, "grad_norm": 0.08128457516431808, "learning_rate": 0.0004912523020257827, "loss": 1.3298, "num_tokens": 3679026.0, "step": 58 }, { "epoch": 0.10870566559189314, "grad_norm": 0.08668368309736252, "learning_rate": 0.0004910988336402701, "loss": 1.3201, "num_tokens": 3743572.0, "step": 59 }, { "epoch": 0.11054813450023031, "grad_norm": 0.08350718766450882, "learning_rate": 0.0004909453652547575, "loss": 1.3101, "num_tokens": 3807432.0, "step": 60 }, { "epoch": 0.11239060340856748, "grad_norm": 0.1054181158542633, "learning_rate": 0.000490791896869245, "loss": 1.3105, "num_tokens": 3869253.0, "step": 61 }, { "epoch": 0.11423307231690465, "grad_norm": 0.07995358854532242, "learning_rate": 0.0004906384284837324, "loss": 1.2964, "num_tokens": 3933671.0, "step": 62 }, { "epoch": 0.11607554122524183, "grad_norm": 0.08543234318494797, "learning_rate": 0.0004904849600982198, "loss": 1.3767, "num_tokens": 3997848.0, "step": 63 }, { "epoch": 0.117918010133579, "grad_norm": 0.08178466558456421, "learning_rate": 0.0004903314917127072, "loss": 1.4225, "num_tokens": 4060010.0, "step": 64 }, { "epoch": 0.11976047904191617, "grad_norm": 0.09869443625211716, "learning_rate": 0.0004901780233271946, "loss": 1.2442, "num_tokens": 4123001.0, "step": 65 }, { "epoch": 0.12160294795025334, "grad_norm": 0.08512373268604279, "learning_rate": 0.000490024554941682, "loss": 1.3672, "num_tokens": 4185477.0, "step": 66 }, { "epoch": 0.1234454168585905, "grad_norm": 0.08496227115392685, "learning_rate": 0.0004898710865561695, "loss": 1.3175, "num_tokens": 4248812.0, "step": 67 }, { "epoch": 0.12528788576692768, "grad_norm": 0.08939068764448166, "learning_rate": 0.0004897176181706569, "loss": 1.3068, "num_tokens": 4311753.0, "step": 68 }, { "epoch": 0.12713035467526485, "grad_norm": 0.09134455025196075, "learning_rate": 0.0004895641497851443, "loss": 1.3697, "num_tokens": 4374419.0, "step": 69 }, { "epoch": 0.12897282358360201, "grad_norm": 0.09110338985919952, "learning_rate": 0.0004894106813996317, "loss": 1.3071, "num_tokens": 4438927.0, "step": 70 }, { "epoch": 0.1308152924919392, "grad_norm": 0.08442392200231552, "learning_rate": 0.0004892572130141191, "loss": 1.2791, "num_tokens": 4500935.0, "step": 71 }, { "epoch": 0.13265776140027638, "grad_norm": 0.08056983351707458, "learning_rate": 0.0004891037446286066, "loss": 1.2831, "num_tokens": 4566078.0, "step": 72 }, { "epoch": 0.13450023030861355, "grad_norm": 0.09895192831754684, "learning_rate": 0.0004889502762430939, "loss": 1.3225, "num_tokens": 4629976.0, "step": 73 }, { "epoch": 0.13634269921695072, "grad_norm": 0.09267780184745789, "learning_rate": 0.0004887968078575814, "loss": 1.4167, "num_tokens": 4693336.0, "step": 74 }, { "epoch": 0.1381851681252879, "grad_norm": 0.0810493677854538, "learning_rate": 0.0004886433394720687, "loss": 1.35, "num_tokens": 4756011.0, "step": 75 }, { "epoch": 0.14002763703362506, "grad_norm": 0.10650582611560822, "learning_rate": 0.0004884898710865562, "loss": 1.3274, "num_tokens": 4819731.0, "step": 76 }, { "epoch": 0.14187010594196223, "grad_norm": 0.09446509927511215, "learning_rate": 0.0004883364027010435, "loss": 1.3608, "num_tokens": 4882518.0, "step": 77 }, { "epoch": 0.1437125748502994, "grad_norm": 0.10349292308092117, "learning_rate": 0.000488182934315531, "loss": 1.2592, "num_tokens": 4945791.0, "step": 78 }, { "epoch": 0.14555504375863657, "grad_norm": 0.08471430838108063, "learning_rate": 0.0004880294659300184, "loss": 1.4234, "num_tokens": 5009648.0, "step": 79 }, { "epoch": 0.14739751266697373, "grad_norm": 0.08815622329711914, "learning_rate": 0.00048787599754450585, "loss": 1.3142, "num_tokens": 5072917.0, "step": 80 }, { "epoch": 0.1492399815753109, "grad_norm": 0.08565309643745422, "learning_rate": 0.0004877225291589932, "loss": 1.3835, "num_tokens": 5136262.0, "step": 81 }, { "epoch": 0.1510824504836481, "grad_norm": 0.08930712938308716, "learning_rate": 0.0004875690607734807, "loss": 1.281, "num_tokens": 5200098.0, "step": 82 }, { "epoch": 0.15292491939198527, "grad_norm": 0.08383305370807648, "learning_rate": 0.00048741559238796805, "loss": 1.3221, "num_tokens": 5264013.0, "step": 83 }, { "epoch": 0.15476738830032244, "grad_norm": 0.08293300122022629, "learning_rate": 0.0004872621240024555, "loss": 1.3335, "num_tokens": 5326761.0, "step": 84 }, { "epoch": 0.1566098572086596, "grad_norm": 0.08868996053934097, "learning_rate": 0.0004871086556169429, "loss": 1.3051, "num_tokens": 5388470.0, "step": 85 }, { "epoch": 0.15845232611699678, "grad_norm": 0.08284460753202438, "learning_rate": 0.00048695518723143035, "loss": 1.3231, "num_tokens": 5451118.0, "step": 86 }, { "epoch": 0.16029479502533395, "grad_norm": 0.08427251875400543, "learning_rate": 0.0004868017188459177, "loss": 1.3723, "num_tokens": 5514259.0, "step": 87 }, { "epoch": 0.16213726393367112, "grad_norm": 0.08224712312221527, "learning_rate": 0.0004866482504604052, "loss": 1.4329, "num_tokens": 5578145.0, "step": 88 }, { "epoch": 0.16397973284200829, "grad_norm": 0.08265923708677292, "learning_rate": 0.00048649478207489255, "loss": 1.2828, "num_tokens": 5641984.0, "step": 89 }, { "epoch": 0.16582220175034545, "grad_norm": 0.09509484469890594, "learning_rate": 0.00048634131368938, "loss": 1.3173, "num_tokens": 5702683.0, "step": 90 }, { "epoch": 0.16766467065868262, "grad_norm": 0.08703058958053589, "learning_rate": 0.0004861878453038674, "loss": 1.3333, "num_tokens": 5765316.0, "step": 91 }, { "epoch": 0.1695071395670198, "grad_norm": 0.08530570566654205, "learning_rate": 0.00048603437691835486, "loss": 1.3056, "num_tokens": 5830103.0, "step": 92 }, { "epoch": 0.171349608475357, "grad_norm": 0.0862644836306572, "learning_rate": 0.0004858809085328422, "loss": 1.3808, "num_tokens": 5892573.0, "step": 93 }, { "epoch": 0.17319207738369416, "grad_norm": 0.0805979073047638, "learning_rate": 0.0004857274401473297, "loss": 1.3201, "num_tokens": 5957041.0, "step": 94 }, { "epoch": 0.17503454629203133, "grad_norm": 0.08946425467729568, "learning_rate": 0.00048557397176181706, "loss": 1.3071, "num_tokens": 6021472.0, "step": 95 }, { "epoch": 0.1768770152003685, "grad_norm": 0.0937597006559372, "learning_rate": 0.0004854205033763045, "loss": 1.3191, "num_tokens": 6084583.0, "step": 96 }, { "epoch": 0.17871948410870567, "grad_norm": 0.08807350695133209, "learning_rate": 0.0004852670349907919, "loss": 1.3597, "num_tokens": 6146922.0, "step": 97 }, { "epoch": 0.18056195301704284, "grad_norm": 0.08338185399770737, "learning_rate": 0.0004851135666052793, "loss": 1.2936, "num_tokens": 6210573.0, "step": 98 }, { "epoch": 0.18240442192538, "grad_norm": 0.09898986667394638, "learning_rate": 0.00048496009821976673, "loss": 1.3035, "num_tokens": 6272912.0, "step": 99 }, { "epoch": 0.18424689083371718, "grad_norm": 0.08478111028671265, "learning_rate": 0.00048480662983425415, "loss": 1.243, "num_tokens": 6336364.0, "step": 100 }, { "epoch": 0.18608935974205434, "grad_norm": 0.09365523606538773, "learning_rate": 0.00048465316144874157, "loss": 1.2975, "num_tokens": 6399588.0, "step": 101 }, { "epoch": 0.1879318286503915, "grad_norm": 0.09440144151449203, "learning_rate": 0.000484499693063229, "loss": 1.3426, "num_tokens": 6461993.0, "step": 102 }, { "epoch": 0.18977429755872868, "grad_norm": 0.09824486821889877, "learning_rate": 0.0004843462246777164, "loss": 1.2556, "num_tokens": 6526332.0, "step": 103 }, { "epoch": 0.19161676646706588, "grad_norm": 0.09350544959306717, "learning_rate": 0.0004841927562922038, "loss": 1.2935, "num_tokens": 6590437.0, "step": 104 }, { "epoch": 0.19345923537540305, "grad_norm": 0.09216629713773727, "learning_rate": 0.00048403928790669124, "loss": 1.3809, "num_tokens": 6653592.0, "step": 105 }, { "epoch": 0.19530170428374022, "grad_norm": 0.10142754018306732, "learning_rate": 0.00048388581952117865, "loss": 1.2555, "num_tokens": 6716822.0, "step": 106 }, { "epoch": 0.1971441731920774, "grad_norm": 0.09099232405424118, "learning_rate": 0.00048373235113566607, "loss": 1.2748, "num_tokens": 6780111.0, "step": 107 }, { "epoch": 0.19898664210041456, "grad_norm": 0.0988006591796875, "learning_rate": 0.0004835788827501535, "loss": 1.3366, "num_tokens": 6844352.0, "step": 108 }, { "epoch": 0.20082911100875173, "grad_norm": 0.09335943311452866, "learning_rate": 0.0004834254143646409, "loss": 1.3317, "num_tokens": 6908951.0, "step": 109 }, { "epoch": 0.2026715799170889, "grad_norm": 0.09635764360427856, "learning_rate": 0.0004832719459791283, "loss": 1.307, "num_tokens": 6971898.0, "step": 110 }, { "epoch": 0.20451404882542606, "grad_norm": 0.10345321148633957, "learning_rate": 0.0004831184775936157, "loss": 1.2791, "num_tokens": 7034469.0, "step": 111 }, { "epoch": 0.20635651773376323, "grad_norm": 0.1048668771982193, "learning_rate": 0.00048296500920810316, "loss": 1.2315, "num_tokens": 7097057.0, "step": 112 }, { "epoch": 0.2081989866421004, "grad_norm": 0.09254509955644608, "learning_rate": 0.0004828115408225905, "loss": 1.3002, "num_tokens": 7160160.0, "step": 113 }, { "epoch": 0.2100414555504376, "grad_norm": 0.09960129112005234, "learning_rate": 0.000482658072437078, "loss": 1.4131, "num_tokens": 7223571.0, "step": 114 }, { "epoch": 0.21188392445877477, "grad_norm": 0.09643974155187607, "learning_rate": 0.00048250460405156536, "loss": 1.2871, "num_tokens": 7287474.0, "step": 115 }, { "epoch": 0.21372639336711194, "grad_norm": 0.08915647119283676, "learning_rate": 0.00048235113566605283, "loss": 1.2467, "num_tokens": 7352196.0, "step": 116 }, { "epoch": 0.2155688622754491, "grad_norm": 0.08909046649932861, "learning_rate": 0.0004821976672805402, "loss": 1.2559, "num_tokens": 7416728.0, "step": 117 }, { "epoch": 0.21741133118378628, "grad_norm": 0.11706819385290146, "learning_rate": 0.00048204419889502767, "loss": 1.3332, "num_tokens": 7480544.0, "step": 118 }, { "epoch": 0.21925380009212345, "grad_norm": 0.08741747587919235, "learning_rate": 0.00048189073050951503, "loss": 1.3279, "num_tokens": 7543899.0, "step": 119 }, { "epoch": 0.22109626900046062, "grad_norm": 0.09022476524114609, "learning_rate": 0.0004817372621240025, "loss": 1.2729, "num_tokens": 7606229.0, "step": 120 }, { "epoch": 0.22293873790879778, "grad_norm": 0.08855341374874115, "learning_rate": 0.00048158379373848987, "loss": 1.2966, "num_tokens": 7669432.0, "step": 121 }, { "epoch": 0.22478120681713495, "grad_norm": 0.09143545478582382, "learning_rate": 0.00048143032535297734, "loss": 1.3796, "num_tokens": 7733533.0, "step": 122 }, { "epoch": 0.22662367572547212, "grad_norm": 0.09877335280179977, "learning_rate": 0.0004812768569674647, "loss": 1.2475, "num_tokens": 7797715.0, "step": 123 }, { "epoch": 0.2284661446338093, "grad_norm": 0.0907440185546875, "learning_rate": 0.0004811233885819522, "loss": 1.279, "num_tokens": 7860140.0, "step": 124 }, { "epoch": 0.2303086135421465, "grad_norm": 0.09071335196495056, "learning_rate": 0.00048096992019643954, "loss": 1.2781, "num_tokens": 7923617.0, "step": 125 }, { "epoch": 0.23215108245048366, "grad_norm": 0.08857566863298416, "learning_rate": 0.00048081645181092695, "loss": 1.356, "num_tokens": 7987454.0, "step": 126 }, { "epoch": 0.23399355135882083, "grad_norm": 0.08675359934568405, "learning_rate": 0.00048066298342541437, "loss": 1.3, "num_tokens": 8050965.0, "step": 127 }, { "epoch": 0.235836020267158, "grad_norm": 0.09069304168224335, "learning_rate": 0.0004805095150399018, "loss": 1.2587, "num_tokens": 8115672.0, "step": 128 }, { "epoch": 0.23767848917549517, "grad_norm": 0.0990447998046875, "learning_rate": 0.0004803560466543892, "loss": 1.2978, "num_tokens": 8178834.0, "step": 129 }, { "epoch": 0.23952095808383234, "grad_norm": 0.09882336109876633, "learning_rate": 0.0004802025782688766, "loss": 1.2334, "num_tokens": 8241067.0, "step": 130 }, { "epoch": 0.2413634269921695, "grad_norm": 0.09168076515197754, "learning_rate": 0.00048004910988336404, "loss": 1.2856, "num_tokens": 8304603.0, "step": 131 }, { "epoch": 0.24320589590050667, "grad_norm": 0.08502545952796936, "learning_rate": 0.00047989564149785146, "loss": 1.3096, "num_tokens": 8368573.0, "step": 132 }, { "epoch": 0.24504836480884384, "grad_norm": 0.09370923042297363, "learning_rate": 0.0004797421731123389, "loss": 1.3355, "num_tokens": 8432455.0, "step": 133 }, { "epoch": 0.246890833717181, "grad_norm": 0.08775393664836884, "learning_rate": 0.0004795887047268263, "loss": 1.2381, "num_tokens": 8496584.0, "step": 134 }, { "epoch": 0.24873330262551818, "grad_norm": 0.08802684396505356, "learning_rate": 0.0004794352363413137, "loss": 1.2444, "num_tokens": 8561586.0, "step": 135 }, { "epoch": 0.25057577153385535, "grad_norm": 0.09058354049921036, "learning_rate": 0.00047928176795580113, "loss": 1.3087, "num_tokens": 8625121.0, "step": 136 }, { "epoch": 0.2524182404421925, "grad_norm": 0.09893903136253357, "learning_rate": 0.00047912829957028855, "loss": 1.3166, "num_tokens": 8689218.0, "step": 137 }, { "epoch": 0.2542607093505297, "grad_norm": 0.09142877161502838, "learning_rate": 0.00047897483118477597, "loss": 1.2798, "num_tokens": 8751321.0, "step": 138 }, { "epoch": 0.25610317825886686, "grad_norm": 0.08350904285907745, "learning_rate": 0.0004788213627992634, "loss": 1.3574, "num_tokens": 8816008.0, "step": 139 }, { "epoch": 0.25794564716720403, "grad_norm": 0.09445066004991531, "learning_rate": 0.0004786678944137508, "loss": 1.338, "num_tokens": 8879049.0, "step": 140 }, { "epoch": 0.2597881160755412, "grad_norm": 0.08924169838428497, "learning_rate": 0.00047851442602823817, "loss": 1.2877, "num_tokens": 8942298.0, "step": 141 }, { "epoch": 0.2616305849838784, "grad_norm": 0.09568005055189133, "learning_rate": 0.00047836095764272564, "loss": 1.2629, "num_tokens": 9007615.0, "step": 142 }, { "epoch": 0.2634730538922156, "grad_norm": 0.09064745903015137, "learning_rate": 0.000478207489257213, "loss": 1.2484, "num_tokens": 9070597.0, "step": 143 }, { "epoch": 0.26531552280055276, "grad_norm": 0.0949067547917366, "learning_rate": 0.0004780540208717004, "loss": 1.3254, "num_tokens": 9135280.0, "step": 144 }, { "epoch": 0.26715799170888993, "grad_norm": 0.10061468183994293, "learning_rate": 0.00047790055248618784, "loss": 1.2997, "num_tokens": 9198462.0, "step": 145 }, { "epoch": 0.2690004606172271, "grad_norm": 0.0937122330069542, "learning_rate": 0.00047774708410067525, "loss": 1.3092, "num_tokens": 9261449.0, "step": 146 }, { "epoch": 0.27084292952556427, "grad_norm": 0.09279117733240128, "learning_rate": 0.00047759361571516267, "loss": 1.2568, "num_tokens": 9325872.0, "step": 147 }, { "epoch": 0.27268539843390144, "grad_norm": 0.10213746130466461, "learning_rate": 0.0004774401473296501, "loss": 1.3235, "num_tokens": 9389755.0, "step": 148 }, { "epoch": 0.2745278673422386, "grad_norm": 0.09749754518270493, "learning_rate": 0.0004772866789441375, "loss": 1.3179, "num_tokens": 9453600.0, "step": 149 }, { "epoch": 0.2763703362505758, "grad_norm": 0.09111884236335754, "learning_rate": 0.0004771332105586249, "loss": 1.3642, "num_tokens": 9517466.0, "step": 150 }, { "epoch": 0.27821280515891295, "grad_norm": 0.10381179302930832, "learning_rate": 0.00047697974217311234, "loss": 1.23, "num_tokens": 9580857.0, "step": 151 }, { "epoch": 0.2800552740672501, "grad_norm": 0.09723608195781708, "learning_rate": 0.00047682627378759976, "loss": 1.2971, "num_tokens": 9643336.0, "step": 152 }, { "epoch": 0.2818977429755873, "grad_norm": 0.09318266063928604, "learning_rate": 0.0004766728054020872, "loss": 1.2831, "num_tokens": 9706631.0, "step": 153 }, { "epoch": 0.28374021188392445, "grad_norm": 0.1071537435054779, "learning_rate": 0.0004765193370165746, "loss": 1.2633, "num_tokens": 9769134.0, "step": 154 }, { "epoch": 0.2855826807922616, "grad_norm": 0.09743782132863998, "learning_rate": 0.000476365868631062, "loss": 1.3811, "num_tokens": 9832985.0, "step": 155 }, { "epoch": 0.2874251497005988, "grad_norm": 0.09620378166437149, "learning_rate": 0.0004762124002455494, "loss": 1.2446, "num_tokens": 9896373.0, "step": 156 }, { "epoch": 0.28926761860893596, "grad_norm": 0.10181791335344315, "learning_rate": 0.00047605893186003685, "loss": 1.286, "num_tokens": 9959560.0, "step": 157 }, { "epoch": 0.29111008751727313, "grad_norm": 0.09868380427360535, "learning_rate": 0.0004759054634745242, "loss": 1.3392, "num_tokens": 10022306.0, "step": 158 }, { "epoch": 0.2929525564256103, "grad_norm": 0.09871764481067657, "learning_rate": 0.0004757519950890117, "loss": 1.2788, "num_tokens": 10084740.0, "step": 159 }, { "epoch": 0.29479502533394747, "grad_norm": 0.09174288809299469, "learning_rate": 0.00047559852670349905, "loss": 1.2304, "num_tokens": 10149096.0, "step": 160 }, { "epoch": 0.29663749424228464, "grad_norm": 0.09089936316013336, "learning_rate": 0.0004754450583179865, "loss": 1.2343, "num_tokens": 10212831.0, "step": 161 }, { "epoch": 0.2984799631506218, "grad_norm": 0.08724789321422577, "learning_rate": 0.0004752915899324739, "loss": 1.313, "num_tokens": 10277667.0, "step": 162 }, { "epoch": 0.30032243205895903, "grad_norm": 0.09532701969146729, "learning_rate": 0.00047513812154696136, "loss": 1.2119, "num_tokens": 10341088.0, "step": 163 }, { "epoch": 0.3021649009672962, "grad_norm": 0.08841877430677414, "learning_rate": 0.0004749846531614487, "loss": 1.2562, "num_tokens": 10403795.0, "step": 164 }, { "epoch": 0.30400736987563337, "grad_norm": 0.10321378707885742, "learning_rate": 0.0004748311847759362, "loss": 1.2951, "num_tokens": 10467188.0, "step": 165 }, { "epoch": 0.30584983878397054, "grad_norm": 0.0962178185582161, "learning_rate": 0.00047467771639042355, "loss": 1.3058, "num_tokens": 10531445.0, "step": 166 }, { "epoch": 0.3076923076923077, "grad_norm": 0.09240318834781647, "learning_rate": 0.000474524248004911, "loss": 1.3048, "num_tokens": 10594415.0, "step": 167 }, { "epoch": 0.3095347766006449, "grad_norm": 0.0968954935669899, "learning_rate": 0.0004743707796193984, "loss": 1.2634, "num_tokens": 10657930.0, "step": 168 }, { "epoch": 0.31137724550898205, "grad_norm": 0.08881954848766327, "learning_rate": 0.00047421731123388586, "loss": 1.2667, "num_tokens": 10722136.0, "step": 169 }, { "epoch": 0.3132197144173192, "grad_norm": 0.09685399383306503, "learning_rate": 0.0004740638428483732, "loss": 1.3094, "num_tokens": 10785449.0, "step": 170 }, { "epoch": 0.3150621833256564, "grad_norm": 0.15080352127552032, "learning_rate": 0.0004739103744628607, "loss": 1.2689, "num_tokens": 10848655.0, "step": 171 }, { "epoch": 0.31690465223399356, "grad_norm": 0.09611034393310547, "learning_rate": 0.00047375690607734806, "loss": 1.2683, "num_tokens": 10911209.0, "step": 172 }, { "epoch": 0.3187471211423307, "grad_norm": 0.09508621692657471, "learning_rate": 0.0004736034376918355, "loss": 1.2842, "num_tokens": 10975256.0, "step": 173 }, { "epoch": 0.3205895900506679, "grad_norm": 0.10551825910806656, "learning_rate": 0.0004734499693063229, "loss": 1.2936, "num_tokens": 11038931.0, "step": 174 }, { "epoch": 0.32243205895900506, "grad_norm": 0.09252557903528214, "learning_rate": 0.0004732965009208103, "loss": 1.3388, "num_tokens": 11102498.0, "step": 175 }, { "epoch": 0.32427452786734223, "grad_norm": 0.11074170470237732, "learning_rate": 0.00047314303253529773, "loss": 1.2661, "num_tokens": 11163963.0, "step": 176 }, { "epoch": 0.3261169967756794, "grad_norm": 0.09143912047147751, "learning_rate": 0.00047298956414978515, "loss": 1.1907, "num_tokens": 11226715.0, "step": 177 }, { "epoch": 0.32795946568401657, "grad_norm": 0.10227463394403458, "learning_rate": 0.00047283609576427257, "loss": 1.1516, "num_tokens": 11290582.0, "step": 178 }, { "epoch": 0.32980193459235374, "grad_norm": 0.09942394495010376, "learning_rate": 0.00047268262737876, "loss": 1.1847, "num_tokens": 11353360.0, "step": 179 }, { "epoch": 0.3316444035006909, "grad_norm": 0.09928397834300995, "learning_rate": 0.0004725291589932474, "loss": 1.2911, "num_tokens": 11416865.0, "step": 180 }, { "epoch": 0.3334868724090281, "grad_norm": 0.1033642515540123, "learning_rate": 0.0004723756906077348, "loss": 1.2977, "num_tokens": 11480172.0, "step": 181 }, { "epoch": 0.33532934131736525, "grad_norm": 0.09114142507314682, "learning_rate": 0.00047222222222222224, "loss": 1.2868, "num_tokens": 11544512.0, "step": 182 }, { "epoch": 0.3371718102257024, "grad_norm": 0.0953393504023552, "learning_rate": 0.00047206875383670966, "loss": 1.2918, "num_tokens": 11607483.0, "step": 183 }, { "epoch": 0.3390142791340396, "grad_norm": 0.11545693874359131, "learning_rate": 0.0004719152854511971, "loss": 1.3424, "num_tokens": 11669271.0, "step": 184 }, { "epoch": 0.3408567480423768, "grad_norm": 0.09393662959337234, "learning_rate": 0.0004717618170656845, "loss": 1.2489, "num_tokens": 11733538.0, "step": 185 }, { "epoch": 0.342699216950714, "grad_norm": 0.11163611710071564, "learning_rate": 0.0004716083486801719, "loss": 1.2819, "num_tokens": 11797358.0, "step": 186 }, { "epoch": 0.34454168585905115, "grad_norm": 0.09949325770139694, "learning_rate": 0.0004714548802946593, "loss": 1.3093, "num_tokens": 11860695.0, "step": 187 }, { "epoch": 0.3463841547673883, "grad_norm": 0.10119225084781647, "learning_rate": 0.0004713014119091467, "loss": 1.3331, "num_tokens": 11921561.0, "step": 188 }, { "epoch": 0.3482266236757255, "grad_norm": 0.11003967374563217, "learning_rate": 0.00047114794352363416, "loss": 1.2571, "num_tokens": 11984325.0, "step": 189 }, { "epoch": 0.35006909258406266, "grad_norm": 0.09147924929857254, "learning_rate": 0.0004709944751381215, "loss": 1.2277, "num_tokens": 12047541.0, "step": 190 }, { "epoch": 0.3519115614923998, "grad_norm": 0.09424199908971786, "learning_rate": 0.000470841006752609, "loss": 1.3135, "num_tokens": 12110331.0, "step": 191 }, { "epoch": 0.353754030400737, "grad_norm": 0.09839576482772827, "learning_rate": 0.00047068753836709636, "loss": 1.3186, "num_tokens": 12174018.0, "step": 192 }, { "epoch": 0.35559649930907417, "grad_norm": 0.09452823549509048, "learning_rate": 0.00047053406998158383, "loss": 1.3467, "num_tokens": 12238555.0, "step": 193 }, { "epoch": 0.35743896821741133, "grad_norm": 0.10120641440153122, "learning_rate": 0.0004703806015960712, "loss": 1.2447, "num_tokens": 12301110.0, "step": 194 }, { "epoch": 0.3592814371257485, "grad_norm": 0.09962154924869537, "learning_rate": 0.00047022713321055867, "loss": 1.3399, "num_tokens": 12364253.0, "step": 195 }, { "epoch": 0.3611239060340857, "grad_norm": 0.09950164705514908, "learning_rate": 0.00047007366482504603, "loss": 1.2764, "num_tokens": 12427775.0, "step": 196 }, { "epoch": 0.36296637494242284, "grad_norm": 0.10616143047809601, "learning_rate": 0.0004699201964395335, "loss": 1.2295, "num_tokens": 12491111.0, "step": 197 }, { "epoch": 0.36480884385076, "grad_norm": 0.09427747875452042, "learning_rate": 0.00046976672805402087, "loss": 1.231, "num_tokens": 12554529.0, "step": 198 }, { "epoch": 0.3666513127590972, "grad_norm": 0.11054739356040955, "learning_rate": 0.00046961325966850834, "loss": 1.2873, "num_tokens": 12618060.0, "step": 199 }, { "epoch": 0.36849378166743435, "grad_norm": 0.10015911608934402, "learning_rate": 0.0004694597912829957, "loss": 1.3164, "num_tokens": 12681344.0, "step": 200 }, { "epoch": 0.3703362505757715, "grad_norm": 0.10280846804380417, "learning_rate": 0.0004693063228974832, "loss": 1.2862, "num_tokens": 12743635.0, "step": 201 }, { "epoch": 0.3721787194841087, "grad_norm": 0.09174755215644836, "learning_rate": 0.00046915285451197054, "loss": 1.1559, "num_tokens": 12808002.0, "step": 202 }, { "epoch": 0.37402118839244586, "grad_norm": 0.09782297909259796, "learning_rate": 0.000468999386126458, "loss": 1.3592, "num_tokens": 12871354.0, "step": 203 }, { "epoch": 0.375863657300783, "grad_norm": 0.11277933418750763, "learning_rate": 0.0004688459177409454, "loss": 1.1873, "num_tokens": 12934741.0, "step": 204 }, { "epoch": 0.3777061262091202, "grad_norm": 0.09454668313264847, "learning_rate": 0.0004686924493554328, "loss": 1.2604, "num_tokens": 12998787.0, "step": 205 }, { "epoch": 0.37954859511745737, "grad_norm": 0.09543941915035248, "learning_rate": 0.0004685389809699202, "loss": 1.2226, "num_tokens": 13063052.0, "step": 206 }, { "epoch": 0.3813910640257946, "grad_norm": 0.11904283612966537, "learning_rate": 0.0004683855125844076, "loss": 1.2741, "num_tokens": 13126172.0, "step": 207 }, { "epoch": 0.38323353293413176, "grad_norm": 0.10892145335674286, "learning_rate": 0.00046823204419889504, "loss": 1.2501, "num_tokens": 13189187.0, "step": 208 }, { "epoch": 0.38507600184246893, "grad_norm": 0.0946912094950676, "learning_rate": 0.00046807857581338246, "loss": 1.2937, "num_tokens": 13251467.0, "step": 209 }, { "epoch": 0.3869184707508061, "grad_norm": 0.11348316073417664, "learning_rate": 0.0004679251074278699, "loss": 1.2249, "num_tokens": 13314149.0, "step": 210 }, { "epoch": 0.38876093965914327, "grad_norm": 0.09878228604793549, "learning_rate": 0.0004677716390423573, "loss": 1.3456, "num_tokens": 13378033.0, "step": 211 }, { "epoch": 0.39060340856748044, "grad_norm": 0.09119324386119843, "learning_rate": 0.0004676181706568447, "loss": 1.2265, "num_tokens": 13441776.0, "step": 212 }, { "epoch": 0.3924458774758176, "grad_norm": 0.11676391959190369, "learning_rate": 0.00046746470227133213, "loss": 1.2745, "num_tokens": 13503327.0, "step": 213 }, { "epoch": 0.3942883463841548, "grad_norm": 0.09270577132701874, "learning_rate": 0.00046731123388581955, "loss": 1.2485, "num_tokens": 13567063.0, "step": 214 }, { "epoch": 0.39613081529249194, "grad_norm": 0.0977969765663147, "learning_rate": 0.0004671577655003069, "loss": 1.2716, "num_tokens": 13631097.0, "step": 215 }, { "epoch": 0.3979732842008291, "grad_norm": 0.09898398816585541, "learning_rate": 0.0004670042971147944, "loss": 1.2662, "num_tokens": 13694133.0, "step": 216 }, { "epoch": 0.3998157531091663, "grad_norm": 0.0990348681807518, "learning_rate": 0.00046685082872928175, "loss": 1.2911, "num_tokens": 13757488.0, "step": 217 }, { "epoch": 0.40165822201750345, "grad_norm": 0.10380908846855164, "learning_rate": 0.0004666973603437692, "loss": 1.2559, "num_tokens": 13820183.0, "step": 218 }, { "epoch": 0.4035006909258406, "grad_norm": 0.13048210740089417, "learning_rate": 0.0004665438919582566, "loss": 1.2468, "num_tokens": 13882956.0, "step": 219 }, { "epoch": 0.4053431598341778, "grad_norm": 0.10040684789419174, "learning_rate": 0.000466390423572744, "loss": 1.3152, "num_tokens": 13945850.0, "step": 220 }, { "epoch": 0.40718562874251496, "grad_norm": 0.1173500344157219, "learning_rate": 0.0004662369551872314, "loss": 1.2492, "num_tokens": 14009914.0, "step": 221 }, { "epoch": 0.40902809765085213, "grad_norm": 0.09842982143163681, "learning_rate": 0.00046608348680171884, "loss": 1.2586, "num_tokens": 14074433.0, "step": 222 }, { "epoch": 0.4108705665591893, "grad_norm": 0.09893294423818588, "learning_rate": 0.00046593001841620626, "loss": 1.2748, "num_tokens": 14137377.0, "step": 223 }, { "epoch": 0.41271303546752647, "grad_norm": 0.14513461291790009, "learning_rate": 0.0004657765500306937, "loss": 1.2998, "num_tokens": 14198599.0, "step": 224 }, { "epoch": 0.41455550437586364, "grad_norm": 0.11926227062940598, "learning_rate": 0.0004656230816451811, "loss": 1.2744, "num_tokens": 14260704.0, "step": 225 }, { "epoch": 0.4163979732842008, "grad_norm": 0.09100288897752762, "learning_rate": 0.0004654696132596685, "loss": 1.2018, "num_tokens": 14324307.0, "step": 226 }, { "epoch": 0.418240442192538, "grad_norm": 0.10198856145143509, "learning_rate": 0.0004653161448741559, "loss": 1.2994, "num_tokens": 14385427.0, "step": 227 }, { "epoch": 0.4200829111008752, "grad_norm": 0.09253379702568054, "learning_rate": 0.00046516267648864334, "loss": 1.3909, "num_tokens": 14448956.0, "step": 228 }, { "epoch": 0.42192538000921237, "grad_norm": 0.0986032783985138, "learning_rate": 0.00046500920810313076, "loss": 1.2834, "num_tokens": 14511130.0, "step": 229 }, { "epoch": 0.42376784891754954, "grad_norm": 0.1144937202334404, "learning_rate": 0.0004648557397176182, "loss": 1.2454, "num_tokens": 14575072.0, "step": 230 }, { "epoch": 0.4256103178258867, "grad_norm": 0.09571216255426407, "learning_rate": 0.0004647022713321056, "loss": 1.2486, "num_tokens": 14639141.0, "step": 231 }, { "epoch": 0.4274527867342239, "grad_norm": 0.10347457975149155, "learning_rate": 0.000464548802946593, "loss": 1.3113, "num_tokens": 14702311.0, "step": 232 }, { "epoch": 0.42929525564256105, "grad_norm": 0.09361619502305984, "learning_rate": 0.00046439533456108043, "loss": 1.2086, "num_tokens": 14766177.0, "step": 233 }, { "epoch": 0.4311377245508982, "grad_norm": 0.09890967607498169, "learning_rate": 0.00046424186617556785, "loss": 1.2793, "num_tokens": 14830662.0, "step": 234 }, { "epoch": 0.4329801934592354, "grad_norm": 0.09661815315485, "learning_rate": 0.0004640883977900552, "loss": 1.2255, "num_tokens": 14895078.0, "step": 235 }, { "epoch": 0.43482266236757255, "grad_norm": 0.09385213255882263, "learning_rate": 0.0004639349294045427, "loss": 1.2231, "num_tokens": 14958503.0, "step": 236 }, { "epoch": 0.4366651312759097, "grad_norm": 0.0976121723651886, "learning_rate": 0.00046378146101903005, "loss": 1.3142, "num_tokens": 15022416.0, "step": 237 }, { "epoch": 0.4385076001842469, "grad_norm": 0.0922638401389122, "learning_rate": 0.0004636279926335175, "loss": 1.1864, "num_tokens": 15086901.0, "step": 238 }, { "epoch": 0.44035006909258406, "grad_norm": 0.0931812971830368, "learning_rate": 0.0004634745242480049, "loss": 1.3026, "num_tokens": 15150409.0, "step": 239 }, { "epoch": 0.44219253800092123, "grad_norm": 0.12121029943227768, "learning_rate": 0.00046332105586249236, "loss": 1.3042, "num_tokens": 15214126.0, "step": 240 }, { "epoch": 0.4440350069092584, "grad_norm": 0.09763725847005844, "learning_rate": 0.0004631675874769797, "loss": 1.2744, "num_tokens": 15277838.0, "step": 241 }, { "epoch": 0.44587747581759557, "grad_norm": 0.09625076502561569, "learning_rate": 0.0004630141190914672, "loss": 1.211, "num_tokens": 15342460.0, "step": 242 }, { "epoch": 0.44771994472593274, "grad_norm": 0.09327404946088791, "learning_rate": 0.00046286065070595456, "loss": 1.2436, "num_tokens": 15404661.0, "step": 243 }, { "epoch": 0.4495624136342699, "grad_norm": 0.0955573171377182, "learning_rate": 0.00046270718232044203, "loss": 1.1972, "num_tokens": 15468531.0, "step": 244 }, { "epoch": 0.4514048825426071, "grad_norm": 0.09429655224084854, "learning_rate": 0.0004625537139349294, "loss": 1.2573, "num_tokens": 15532284.0, "step": 245 }, { "epoch": 0.45324735145094425, "grad_norm": 0.1168677881360054, "learning_rate": 0.00046240024554941686, "loss": 1.2014, "num_tokens": 15594401.0, "step": 246 }, { "epoch": 0.4550898203592814, "grad_norm": 0.10106991231441498, "learning_rate": 0.00046224677716390423, "loss": 1.2965, "num_tokens": 15659106.0, "step": 247 }, { "epoch": 0.4569322892676186, "grad_norm": 0.11693844199180603, "learning_rate": 0.0004620933087783917, "loss": 1.2828, "num_tokens": 15723438.0, "step": 248 }, { "epoch": 0.45877475817595575, "grad_norm": 0.09572553634643555, "learning_rate": 0.00046193984039287906, "loss": 1.3022, "num_tokens": 15787307.0, "step": 249 }, { "epoch": 0.460617227084293, "grad_norm": 0.09704402089118958, "learning_rate": 0.0004617863720073665, "loss": 1.2891, "num_tokens": 15851786.0, "step": 250 }, { "epoch": 0.46245969599263015, "grad_norm": 0.09775398671627045, "learning_rate": 0.0004616329036218539, "loss": 1.2251, "num_tokens": 15915381.0, "step": 251 }, { "epoch": 0.4643021649009673, "grad_norm": 0.10276384651660919, "learning_rate": 0.0004614794352363413, "loss": 1.2612, "num_tokens": 15977498.0, "step": 252 }, { "epoch": 0.4661446338093045, "grad_norm": 0.09268301725387573, "learning_rate": 0.00046132596685082873, "loss": 1.2522, "num_tokens": 16040740.0, "step": 253 }, { "epoch": 0.46798710271764166, "grad_norm": 0.10253075510263443, "learning_rate": 0.00046117249846531615, "loss": 1.3038, "num_tokens": 16103627.0, "step": 254 }, { "epoch": 0.4698295716259788, "grad_norm": 0.09636884927749634, "learning_rate": 0.00046101903007980357, "loss": 1.2493, "num_tokens": 16167151.0, "step": 255 }, { "epoch": 0.471672040534316, "grad_norm": 0.1097356379032135, "learning_rate": 0.000460865561694291, "loss": 1.2242, "num_tokens": 16228494.0, "step": 256 }, { "epoch": 0.47351450944265316, "grad_norm": 0.1036401093006134, "learning_rate": 0.0004607120933087784, "loss": 1.2367, "num_tokens": 16290975.0, "step": 257 }, { "epoch": 0.47535697835099033, "grad_norm": 0.09891247749328613, "learning_rate": 0.0004605586249232658, "loss": 1.1769, "num_tokens": 16353958.0, "step": 258 }, { "epoch": 0.4771994472593275, "grad_norm": 0.0961914137005806, "learning_rate": 0.00046040515653775324, "loss": 1.2461, "num_tokens": 16418512.0, "step": 259 }, { "epoch": 0.47904191616766467, "grad_norm": 0.09430357813835144, "learning_rate": 0.00046025168815224066, "loss": 1.1943, "num_tokens": 16481973.0, "step": 260 }, { "epoch": 0.48088438507600184, "grad_norm": 0.09624114632606506, "learning_rate": 0.0004600982197667281, "loss": 1.1657, "num_tokens": 16545272.0, "step": 261 }, { "epoch": 0.482726853984339, "grad_norm": 0.09604683518409729, "learning_rate": 0.0004599447513812155, "loss": 1.2235, "num_tokens": 16609443.0, "step": 262 }, { "epoch": 0.4845693228926762, "grad_norm": 0.10418158769607544, "learning_rate": 0.0004597912829957029, "loss": 1.2592, "num_tokens": 16673001.0, "step": 263 }, { "epoch": 0.48641179180101335, "grad_norm": 0.11444243788719177, "learning_rate": 0.00045963781461019033, "loss": 1.21, "num_tokens": 16736542.0, "step": 264 }, { "epoch": 0.4882542607093505, "grad_norm": 0.09989791363477707, "learning_rate": 0.0004594843462246777, "loss": 1.2184, "num_tokens": 16799944.0, "step": 265 }, { "epoch": 0.4900967296176877, "grad_norm": 0.1089451014995575, "learning_rate": 0.00045933087783916516, "loss": 1.2254, "num_tokens": 16862904.0, "step": 266 }, { "epoch": 0.49193919852602486, "grad_norm": 0.10207071900367737, "learning_rate": 0.00045917740945365253, "loss": 1.2789, "num_tokens": 16925720.0, "step": 267 }, { "epoch": 0.493781667434362, "grad_norm": 0.10958519577980042, "learning_rate": 0.00045902394106814, "loss": 1.2514, "num_tokens": 16989306.0, "step": 268 }, { "epoch": 0.4956241363426992, "grad_norm": 0.09575817734003067, "learning_rate": 0.00045887047268262736, "loss": 1.2103, "num_tokens": 17053464.0, "step": 269 }, { "epoch": 0.49746660525103636, "grad_norm": 0.09897450357675552, "learning_rate": 0.00045871700429711483, "loss": 1.2866, "num_tokens": 17117587.0, "step": 270 }, { "epoch": 0.49930907415937353, "grad_norm": 0.09855060279369354, "learning_rate": 0.0004585635359116022, "loss": 1.2666, "num_tokens": 17180070.0, "step": 271 }, { "epoch": 0.5011515430677107, "grad_norm": 0.09505361318588257, "learning_rate": 0.00045841006752608967, "loss": 1.252, "num_tokens": 17244637.0, "step": 272 }, { "epoch": 0.5029940119760479, "grad_norm": 0.0996844545006752, "learning_rate": 0.00045825659914057703, "loss": 1.2324, "num_tokens": 17307708.0, "step": 273 }, { "epoch": 0.504836480884385, "grad_norm": 0.10562769323587418, "learning_rate": 0.0004581031307550645, "loss": 1.2653, "num_tokens": 17370782.0, "step": 274 }, { "epoch": 0.5066789497927222, "grad_norm": 0.09653283655643463, "learning_rate": 0.00045794966236955187, "loss": 1.275, "num_tokens": 17433657.0, "step": 275 }, { "epoch": 0.5085214187010594, "grad_norm": 0.09708886593580246, "learning_rate": 0.00045779619398403934, "loss": 1.2795, "num_tokens": 17496384.0, "step": 276 }, { "epoch": 0.5103638876093965, "grad_norm": 0.10174455493688583, "learning_rate": 0.0004576427255985267, "loss": 1.2957, "num_tokens": 17558933.0, "step": 277 }, { "epoch": 0.5122063565177337, "grad_norm": 0.10842365771532059, "learning_rate": 0.0004574892572130142, "loss": 1.2261, "num_tokens": 17621334.0, "step": 278 }, { "epoch": 0.5140488254260709, "grad_norm": 0.10619796812534332, "learning_rate": 0.00045733578882750154, "loss": 1.2916, "num_tokens": 17683886.0, "step": 279 }, { "epoch": 0.5158912943344081, "grad_norm": 0.0956711620092392, "learning_rate": 0.000457182320441989, "loss": 1.2689, "num_tokens": 17747959.0, "step": 280 }, { "epoch": 0.5177337632427452, "grad_norm": 0.09967044740915298, "learning_rate": 0.0004570288520564764, "loss": 1.3285, "num_tokens": 17810830.0, "step": 281 }, { "epoch": 0.5195762321510824, "grad_norm": 0.10575132071971893, "learning_rate": 0.0004568753836709638, "loss": 1.237, "num_tokens": 17874951.0, "step": 282 }, { "epoch": 0.5214187010594197, "grad_norm": 0.10313896834850311, "learning_rate": 0.0004567219152854512, "loss": 1.2967, "num_tokens": 17937625.0, "step": 283 }, { "epoch": 0.5232611699677568, "grad_norm": 0.09952889382839203, "learning_rate": 0.00045656844689993863, "loss": 1.2739, "num_tokens": 18001191.0, "step": 284 }, { "epoch": 0.525103638876094, "grad_norm": 0.10496751219034195, "learning_rate": 0.00045641497851442605, "loss": 1.2624, "num_tokens": 18065539.0, "step": 285 }, { "epoch": 0.5269461077844312, "grad_norm": 0.10126779228448868, "learning_rate": 0.0004562615101289134, "loss": 1.2562, "num_tokens": 18127521.0, "step": 286 }, { "epoch": 0.5287885766927684, "grad_norm": 0.09689553827047348, "learning_rate": 0.0004561080417434009, "loss": 1.2782, "num_tokens": 18190798.0, "step": 287 }, { "epoch": 0.5306310456011055, "grad_norm": 0.10323937237262726, "learning_rate": 0.00045595457335788825, "loss": 1.2975, "num_tokens": 18253818.0, "step": 288 }, { "epoch": 0.5324735145094427, "grad_norm": 0.0979054868221283, "learning_rate": 0.0004558011049723757, "loss": 1.272, "num_tokens": 18317908.0, "step": 289 }, { "epoch": 0.5343159834177799, "grad_norm": 0.09808478504419327, "learning_rate": 0.0004556476365868631, "loss": 1.2366, "num_tokens": 18382485.0, "step": 290 }, { "epoch": 0.536158452326117, "grad_norm": 0.10925448685884476, "learning_rate": 0.00045549416820135055, "loss": 1.2558, "num_tokens": 18443068.0, "step": 291 }, { "epoch": 0.5380009212344542, "grad_norm": 0.10182604938745499, "learning_rate": 0.0004553406998158379, "loss": 1.1956, "num_tokens": 18507262.0, "step": 292 }, { "epoch": 0.5398433901427914, "grad_norm": 0.10915637016296387, "learning_rate": 0.0004551872314303254, "loss": 1.3081, "num_tokens": 18569938.0, "step": 293 }, { "epoch": 0.5416858590511285, "grad_norm": 0.10115023702383041, "learning_rate": 0.00045503376304481275, "loss": 1.2433, "num_tokens": 18634097.0, "step": 294 }, { "epoch": 0.5435283279594657, "grad_norm": 0.0970570296049118, "learning_rate": 0.0004548802946593002, "loss": 1.2365, "num_tokens": 18697352.0, "step": 295 }, { "epoch": 0.5453707968678029, "grad_norm": 0.10749422758817673, "learning_rate": 0.0004547268262737876, "loss": 1.216, "num_tokens": 18760714.0, "step": 296 }, { "epoch": 0.54721326577614, "grad_norm": 0.100978784263134, "learning_rate": 0.000454573357888275, "loss": 1.276, "num_tokens": 18823556.0, "step": 297 }, { "epoch": 0.5490557346844772, "grad_norm": 0.09869664162397385, "learning_rate": 0.0004544198895027624, "loss": 1.1872, "num_tokens": 18887632.0, "step": 298 }, { "epoch": 0.5508982035928144, "grad_norm": 0.10011962056159973, "learning_rate": 0.00045426642111724984, "loss": 1.2423, "num_tokens": 18951795.0, "step": 299 }, { "epoch": 0.5527406725011516, "grad_norm": 0.09465339034795761, "learning_rate": 0.00045411295273173726, "loss": 1.1894, "num_tokens": 19014705.0, "step": 300 }, { "epoch": 0.5545831414094887, "grad_norm": 0.134961798787117, "learning_rate": 0.0004539594843462247, "loss": 1.2732, "num_tokens": 19078699.0, "step": 301 }, { "epoch": 0.5564256103178259, "grad_norm": 0.10843180865049362, "learning_rate": 0.0004538060159607121, "loss": 1.2258, "num_tokens": 19140001.0, "step": 302 }, { "epoch": 0.5582680792261631, "grad_norm": 0.09568282961845398, "learning_rate": 0.0004536525475751995, "loss": 1.2283, "num_tokens": 19205053.0, "step": 303 }, { "epoch": 0.5601105481345002, "grad_norm": 0.10251955687999725, "learning_rate": 0.00045349907918968693, "loss": 1.2481, "num_tokens": 19267167.0, "step": 304 }, { "epoch": 0.5619530170428374, "grad_norm": 0.11323412507772446, "learning_rate": 0.00045334561080417435, "loss": 1.2016, "num_tokens": 19331232.0, "step": 305 }, { "epoch": 0.5637954859511746, "grad_norm": 0.12728968262672424, "learning_rate": 0.00045319214241866176, "loss": 1.2453, "num_tokens": 19393786.0, "step": 306 }, { "epoch": 0.5656379548595117, "grad_norm": 0.10079807788133621, "learning_rate": 0.0004530386740331492, "loss": 1.2637, "num_tokens": 19457776.0, "step": 307 }, { "epoch": 0.5674804237678489, "grad_norm": 0.10168179869651794, "learning_rate": 0.0004528852056476366, "loss": 1.2207, "num_tokens": 19521912.0, "step": 308 }, { "epoch": 0.5693228926761861, "grad_norm": 0.1026490181684494, "learning_rate": 0.000452731737262124, "loss": 1.2569, "num_tokens": 19585648.0, "step": 309 }, { "epoch": 0.5711653615845232, "grad_norm": 0.10141963511705399, "learning_rate": 0.00045257826887661144, "loss": 1.2911, "num_tokens": 19649681.0, "step": 310 }, { "epoch": 0.5730078304928604, "grad_norm": 0.09750928729772568, "learning_rate": 0.00045242480049109885, "loss": 1.2784, "num_tokens": 19713126.0, "step": 311 }, { "epoch": 0.5748502994011976, "grad_norm": 0.10510984063148499, "learning_rate": 0.0004522713321055862, "loss": 1.2564, "num_tokens": 19774951.0, "step": 312 }, { "epoch": 0.5766927683095348, "grad_norm": 0.12155630439519882, "learning_rate": 0.0004521178637200737, "loss": 1.2264, "num_tokens": 19838877.0, "step": 313 }, { "epoch": 0.5785352372178719, "grad_norm": 0.1039825901389122, "learning_rate": 0.00045196439533456105, "loss": 1.2817, "num_tokens": 19902518.0, "step": 314 }, { "epoch": 0.5803777061262091, "grad_norm": 0.11095499992370605, "learning_rate": 0.0004518109269490485, "loss": 1.3281, "num_tokens": 19965720.0, "step": 315 }, { "epoch": 0.5822201750345463, "grad_norm": 0.11511994898319244, "learning_rate": 0.0004516574585635359, "loss": 1.1988, "num_tokens": 20029016.0, "step": 316 }, { "epoch": 0.5840626439428834, "grad_norm": 0.10079848766326904, "learning_rate": 0.00045150399017802336, "loss": 1.2323, "num_tokens": 20092163.0, "step": 317 }, { "epoch": 0.5859051128512206, "grad_norm": 0.10359913110733032, "learning_rate": 0.0004513505217925107, "loss": 1.2468, "num_tokens": 20156687.0, "step": 318 }, { "epoch": 0.5877475817595578, "grad_norm": 0.1077289804816246, "learning_rate": 0.0004511970534069982, "loss": 1.2477, "num_tokens": 20220208.0, "step": 319 }, { "epoch": 0.5895900506678949, "grad_norm": 0.1055116355419159, "learning_rate": 0.00045104358502148556, "loss": 1.1987, "num_tokens": 20283517.0, "step": 320 }, { "epoch": 0.5914325195762321, "grad_norm": 0.10269372910261154, "learning_rate": 0.00045089011663597303, "loss": 1.3135, "num_tokens": 20346936.0, "step": 321 }, { "epoch": 0.5932749884845693, "grad_norm": 0.11854238063097, "learning_rate": 0.0004507366482504604, "loss": 1.2127, "num_tokens": 20408961.0, "step": 322 }, { "epoch": 0.5951174573929064, "grad_norm": 0.10776227712631226, "learning_rate": 0.00045058317986494787, "loss": 1.1921, "num_tokens": 20472264.0, "step": 323 }, { "epoch": 0.5969599263012436, "grad_norm": 0.10772054642438889, "learning_rate": 0.00045042971147943523, "loss": 1.2542, "num_tokens": 20533623.0, "step": 324 }, { "epoch": 0.5988023952095808, "grad_norm": 0.1143348291516304, "learning_rate": 0.0004502762430939227, "loss": 1.3254, "num_tokens": 20597582.0, "step": 325 }, { "epoch": 0.6006448641179181, "grad_norm": 0.12067167460918427, "learning_rate": 0.00045012277470841006, "loss": 1.2602, "num_tokens": 20661625.0, "step": 326 }, { "epoch": 0.6024873330262552, "grad_norm": 0.10979700088500977, "learning_rate": 0.00044996930632289754, "loss": 1.1802, "num_tokens": 20722827.0, "step": 327 }, { "epoch": 0.6043298019345924, "grad_norm": 0.11592204868793488, "learning_rate": 0.0004498158379373849, "loss": 1.2098, "num_tokens": 20786049.0, "step": 328 }, { "epoch": 0.6061722708429296, "grad_norm": 0.11568925529718399, "learning_rate": 0.0004496623695518723, "loss": 1.2407, "num_tokens": 20847442.0, "step": 329 }, { "epoch": 0.6080147397512667, "grad_norm": 0.10582154244184494, "learning_rate": 0.00044950890116635974, "loss": 1.1987, "num_tokens": 20911023.0, "step": 330 }, { "epoch": 0.6098572086596039, "grad_norm": 0.14516089856624603, "learning_rate": 0.00044935543278084715, "loss": 1.2199, "num_tokens": 20974501.0, "step": 331 }, { "epoch": 0.6116996775679411, "grad_norm": 0.10579651594161987, "learning_rate": 0.00044920196439533457, "loss": 1.2317, "num_tokens": 21038953.0, "step": 332 }, { "epoch": 0.6135421464762782, "grad_norm": 0.10715454071760178, "learning_rate": 0.000449048496009822, "loss": 1.2515, "num_tokens": 21101731.0, "step": 333 }, { "epoch": 0.6153846153846154, "grad_norm": 0.10953768342733383, "learning_rate": 0.0004488950276243094, "loss": 1.2272, "num_tokens": 21165490.0, "step": 334 }, { "epoch": 0.6172270842929526, "grad_norm": 0.09921599924564362, "learning_rate": 0.0004487415592387968, "loss": 1.2194, "num_tokens": 21227494.0, "step": 335 }, { "epoch": 0.6190695532012898, "grad_norm": 0.1074928343296051, "learning_rate": 0.00044858809085328424, "loss": 1.2373, "num_tokens": 21289582.0, "step": 336 }, { "epoch": 0.6209120221096269, "grad_norm": 0.10659728944301605, "learning_rate": 0.00044843462246777166, "loss": 1.232, "num_tokens": 21352434.0, "step": 337 }, { "epoch": 0.6227544910179641, "grad_norm": 0.11081191152334213, "learning_rate": 0.0004482811540822591, "loss": 1.2125, "num_tokens": 21416609.0, "step": 338 }, { "epoch": 0.6245969599263013, "grad_norm": 0.10611343383789062, "learning_rate": 0.0004481276856967465, "loss": 1.2681, "num_tokens": 21478260.0, "step": 339 }, { "epoch": 0.6264394288346384, "grad_norm": 0.10498322546482086, "learning_rate": 0.0004479742173112339, "loss": 1.2304, "num_tokens": 21543327.0, "step": 340 }, { "epoch": 0.6282818977429756, "grad_norm": 0.11141800880432129, "learning_rate": 0.00044782074892572133, "loss": 1.1766, "num_tokens": 21608507.0, "step": 341 }, { "epoch": 0.6301243666513128, "grad_norm": 0.10133706778287888, "learning_rate": 0.00044766728054020875, "loss": 1.2173, "num_tokens": 21671818.0, "step": 342 }, { "epoch": 0.6319668355596499, "grad_norm": 0.1038823351264, "learning_rate": 0.00044751381215469617, "loss": 1.2126, "num_tokens": 21734486.0, "step": 343 }, { "epoch": 0.6338093044679871, "grad_norm": 0.12705904245376587, "learning_rate": 0.00044736034376918353, "loss": 1.2288, "num_tokens": 21797220.0, "step": 344 }, { "epoch": 0.6356517733763243, "grad_norm": 0.10769445449113846, "learning_rate": 0.000447206875383671, "loss": 1.3181, "num_tokens": 21860115.0, "step": 345 }, { "epoch": 0.6374942422846614, "grad_norm": 0.1008937656879425, "learning_rate": 0.00044705340699815836, "loss": 1.2684, "num_tokens": 21924009.0, "step": 346 }, { "epoch": 0.6393367111929986, "grad_norm": 0.10911904275417328, "learning_rate": 0.00044689993861264584, "loss": 1.2134, "num_tokens": 21988489.0, "step": 347 }, { "epoch": 0.6411791801013358, "grad_norm": 0.11573173850774765, "learning_rate": 0.0004467464702271332, "loss": 1.227, "num_tokens": 22051833.0, "step": 348 }, { "epoch": 0.643021649009673, "grad_norm": 0.11072526127099991, "learning_rate": 0.00044659300184162067, "loss": 1.2252, "num_tokens": 22115709.0, "step": 349 }, { "epoch": 0.6448641179180101, "grad_norm": 0.11368118226528168, "learning_rate": 0.00044643953345610804, "loss": 1.2185, "num_tokens": 22179135.0, "step": 350 }, { "epoch": 0.6467065868263473, "grad_norm": 0.09761424362659454, "learning_rate": 0.0004462860650705955, "loss": 1.1917, "num_tokens": 22242505.0, "step": 351 }, { "epoch": 0.6485490557346845, "grad_norm": 0.11453306674957275, "learning_rate": 0.00044613259668508287, "loss": 1.2412, "num_tokens": 22306863.0, "step": 352 }, { "epoch": 0.6503915246430216, "grad_norm": 0.10386066138744354, "learning_rate": 0.00044597912829957034, "loss": 1.2458, "num_tokens": 22370973.0, "step": 353 }, { "epoch": 0.6522339935513588, "grad_norm": 0.10413955897092819, "learning_rate": 0.0004458256599140577, "loss": 1.2257, "num_tokens": 22433564.0, "step": 354 }, { "epoch": 0.654076462459696, "grad_norm": 0.10333375632762909, "learning_rate": 0.0004456721915285452, "loss": 1.2325, "num_tokens": 22497012.0, "step": 355 }, { "epoch": 0.6559189313680331, "grad_norm": 0.11412518471479416, "learning_rate": 0.00044551872314303254, "loss": 1.188, "num_tokens": 22562161.0, "step": 356 }, { "epoch": 0.6577614002763703, "grad_norm": 0.10158652812242508, "learning_rate": 0.00044536525475751996, "loss": 1.2046, "num_tokens": 22625149.0, "step": 357 }, { "epoch": 0.6596038691847075, "grad_norm": 0.11158877611160278, "learning_rate": 0.0004452117863720074, "loss": 1.1889, "num_tokens": 22686994.0, "step": 358 }, { "epoch": 0.6614463380930447, "grad_norm": 0.10429638624191284, "learning_rate": 0.00044505831798649474, "loss": 1.2097, "num_tokens": 22750680.0, "step": 359 }, { "epoch": 0.6632888070013818, "grad_norm": 0.11209189891815186, "learning_rate": 0.0004449048496009822, "loss": 1.1523, "num_tokens": 22815693.0, "step": 360 }, { "epoch": 0.665131275909719, "grad_norm": 0.10211613029241562, "learning_rate": 0.0004447513812154696, "loss": 1.1886, "num_tokens": 22879853.0, "step": 361 }, { "epoch": 0.6669737448180562, "grad_norm": 0.09991370141506195, "learning_rate": 0.00044459791282995705, "loss": 1.1664, "num_tokens": 22943177.0, "step": 362 }, { "epoch": 0.6688162137263933, "grad_norm": 0.10149337351322174, "learning_rate": 0.0004444444444444444, "loss": 1.2154, "num_tokens": 23008299.0, "step": 363 }, { "epoch": 0.6706586826347305, "grad_norm": 0.10461593419313431, "learning_rate": 0.0004442909760589319, "loss": 1.2323, "num_tokens": 23071172.0, "step": 364 }, { "epoch": 0.6725011515430677, "grad_norm": 0.1017397940158844, "learning_rate": 0.00044413750767341925, "loss": 1.3048, "num_tokens": 23135197.0, "step": 365 }, { "epoch": 0.6743436204514048, "grad_norm": 0.11012981832027435, "learning_rate": 0.0004439840392879067, "loss": 1.2156, "num_tokens": 23196100.0, "step": 366 }, { "epoch": 0.676186089359742, "grad_norm": 0.10384906083345413, "learning_rate": 0.0004438305709023941, "loss": 1.2588, "num_tokens": 23260175.0, "step": 367 }, { "epoch": 0.6780285582680792, "grad_norm": 0.10092870146036148, "learning_rate": 0.00044367710251688155, "loss": 1.242, "num_tokens": 23324771.0, "step": 368 }, { "epoch": 0.6798710271764163, "grad_norm": 0.11139003932476044, "learning_rate": 0.0004435236341313689, "loss": 1.2244, "num_tokens": 23387765.0, "step": 369 }, { "epoch": 0.6817134960847536, "grad_norm": 0.11014363914728165, "learning_rate": 0.0004433701657458564, "loss": 1.2576, "num_tokens": 23451981.0, "step": 370 }, { "epoch": 0.6835559649930908, "grad_norm": 0.1018562763929367, "learning_rate": 0.00044321669736034375, "loss": 1.2044, "num_tokens": 23515897.0, "step": 371 }, { "epoch": 0.685398433901428, "grad_norm": 0.11016716063022614, "learning_rate": 0.0004430632289748312, "loss": 1.3438, "num_tokens": 23579468.0, "step": 372 }, { "epoch": 0.6872409028097651, "grad_norm": 0.1040375828742981, "learning_rate": 0.0004429097605893186, "loss": 1.2598, "num_tokens": 23643216.0, "step": 373 }, { "epoch": 0.6890833717181023, "grad_norm": 0.09719370305538177, "learning_rate": 0.000442756292203806, "loss": 1.2942, "num_tokens": 23706780.0, "step": 374 }, { "epoch": 0.6909258406264395, "grad_norm": 0.09986993670463562, "learning_rate": 0.0004426028238182934, "loss": 1.1667, "num_tokens": 23771003.0, "step": 375 }, { "epoch": 0.6927683095347766, "grad_norm": 0.10404027998447418, "learning_rate": 0.00044244935543278084, "loss": 1.1794, "num_tokens": 23834093.0, "step": 376 }, { "epoch": 0.6946107784431138, "grad_norm": 0.1068766638636589, "learning_rate": 0.00044229588704726826, "loss": 1.2218, "num_tokens": 23895738.0, "step": 377 }, { "epoch": 0.696453247351451, "grad_norm": 0.10458534955978394, "learning_rate": 0.0004421424186617557, "loss": 1.1883, "num_tokens": 23959320.0, "step": 378 }, { "epoch": 0.6982957162597881, "grad_norm": 0.10190456360578537, "learning_rate": 0.0004419889502762431, "loss": 1.3227, "num_tokens": 24023785.0, "step": 379 }, { "epoch": 0.7001381851681253, "grad_norm": 0.10291954129934311, "learning_rate": 0.0004418354818907305, "loss": 1.2623, "num_tokens": 24087357.0, "step": 380 }, { "epoch": 0.7019806540764625, "grad_norm": 0.1056765615940094, "learning_rate": 0.00044168201350521793, "loss": 1.3298, "num_tokens": 24150982.0, "step": 381 }, { "epoch": 0.7038231229847997, "grad_norm": 0.10178087651729584, "learning_rate": 0.00044152854511970535, "loss": 1.1671, "num_tokens": 24212857.0, "step": 382 }, { "epoch": 0.7056655918931368, "grad_norm": 0.0936005711555481, "learning_rate": 0.00044137507673419277, "loss": 1.3168, "num_tokens": 24275964.0, "step": 383 }, { "epoch": 0.707508060801474, "grad_norm": 0.10786133259534836, "learning_rate": 0.0004412216083486802, "loss": 1.2535, "num_tokens": 24338677.0, "step": 384 }, { "epoch": 0.7093505297098112, "grad_norm": 0.10507118701934814, "learning_rate": 0.0004410681399631676, "loss": 1.2392, "num_tokens": 24402651.0, "step": 385 }, { "epoch": 0.7111929986181483, "grad_norm": 0.10568293929100037, "learning_rate": 0.000440914671577655, "loss": 1.2476, "num_tokens": 24466290.0, "step": 386 }, { "epoch": 0.7130354675264855, "grad_norm": 0.10220424830913544, "learning_rate": 0.00044076120319214244, "loss": 1.2549, "num_tokens": 24530793.0, "step": 387 }, { "epoch": 0.7148779364348227, "grad_norm": 0.10975175350904465, "learning_rate": 0.00044060773480662985, "loss": 1.2201, "num_tokens": 24594318.0, "step": 388 }, { "epoch": 0.7167204053431598, "grad_norm": 0.10170818120241165, "learning_rate": 0.0004404542664211172, "loss": 1.2288, "num_tokens": 24658598.0, "step": 389 }, { "epoch": 0.718562874251497, "grad_norm": 0.10951067507266998, "learning_rate": 0.0004403007980356047, "loss": 1.2554, "num_tokens": 24720701.0, "step": 390 }, { "epoch": 0.7204053431598342, "grad_norm": 0.11519483476877213, "learning_rate": 0.00044014732965009205, "loss": 1.2759, "num_tokens": 24784316.0, "step": 391 }, { "epoch": 0.7222478120681713, "grad_norm": 0.10630222409963608, "learning_rate": 0.0004399938612645795, "loss": 1.1959, "num_tokens": 24848987.0, "step": 392 }, { "epoch": 0.7240902809765085, "grad_norm": 0.11281648278236389, "learning_rate": 0.0004398403928790669, "loss": 1.1439, "num_tokens": 24912623.0, "step": 393 }, { "epoch": 0.7259327498848457, "grad_norm": 0.12020087987184525, "learning_rate": 0.00043968692449355436, "loss": 1.2012, "num_tokens": 24976643.0, "step": 394 }, { "epoch": 0.7277752187931829, "grad_norm": 0.10685519129037857, "learning_rate": 0.0004395334561080417, "loss": 1.1606, "num_tokens": 25039775.0, "step": 395 }, { "epoch": 0.72961768770152, "grad_norm": 0.12509074807167053, "learning_rate": 0.0004393799877225292, "loss": 1.2251, "num_tokens": 25102092.0, "step": 396 }, { "epoch": 0.7314601566098572, "grad_norm": 0.10460842400789261, "learning_rate": 0.00043922651933701656, "loss": 1.1821, "num_tokens": 25165283.0, "step": 397 }, { "epoch": 0.7333026255181944, "grad_norm": 0.1033405214548111, "learning_rate": 0.00043907305095150403, "loss": 1.238, "num_tokens": 25228524.0, "step": 398 }, { "epoch": 0.7351450944265315, "grad_norm": 0.10843577980995178, "learning_rate": 0.0004389195825659914, "loss": 1.2125, "num_tokens": 25292140.0, "step": 399 }, { "epoch": 0.7369875633348687, "grad_norm": 0.10244468599557877, "learning_rate": 0.00043876611418047887, "loss": 1.1616, "num_tokens": 25355907.0, "step": 400 }, { "epoch": 0.7388300322432059, "grad_norm": 0.09961044788360596, "learning_rate": 0.00043861264579496623, "loss": 1.2539, "num_tokens": 25419901.0, "step": 401 }, { "epoch": 0.740672501151543, "grad_norm": 0.10825616121292114, "learning_rate": 0.0004384591774094537, "loss": 1.2002, "num_tokens": 25483890.0, "step": 402 }, { "epoch": 0.7425149700598802, "grad_norm": 0.1140342578291893, "learning_rate": 0.00043830570902394107, "loss": 1.2524, "num_tokens": 25546565.0, "step": 403 }, { "epoch": 0.7443574389682174, "grad_norm": 0.1101999282836914, "learning_rate": 0.00043815224063842854, "loss": 1.2895, "num_tokens": 25610255.0, "step": 404 }, { "epoch": 0.7461999078765545, "grad_norm": 0.11343330889940262, "learning_rate": 0.0004379987722529159, "loss": 1.1722, "num_tokens": 25673567.0, "step": 405 }, { "epoch": 0.7480423767848917, "grad_norm": 0.11592653393745422, "learning_rate": 0.0004378453038674033, "loss": 1.1948, "num_tokens": 25737304.0, "step": 406 }, { "epoch": 0.7498848456932289, "grad_norm": 0.10558687150478363, "learning_rate": 0.00043769183548189074, "loss": 1.2389, "num_tokens": 25801295.0, "step": 407 }, { "epoch": 0.751727314601566, "grad_norm": 0.12302211672067642, "learning_rate": 0.00043753836709637815, "loss": 1.2814, "num_tokens": 25863507.0, "step": 408 }, { "epoch": 0.7535697835099032, "grad_norm": 0.10455392301082611, "learning_rate": 0.00043738489871086557, "loss": 1.2585, "num_tokens": 25927318.0, "step": 409 }, { "epoch": 0.7554122524182404, "grad_norm": 0.10565931349992752, "learning_rate": 0.000437231430325353, "loss": 1.2652, "num_tokens": 25990290.0, "step": 410 }, { "epoch": 0.7572547213265776, "grad_norm": 0.10755893588066101, "learning_rate": 0.0004370779619398404, "loss": 1.2348, "num_tokens": 26053280.0, "step": 411 }, { "epoch": 0.7590971902349147, "grad_norm": 0.10607592016458511, "learning_rate": 0.0004369244935543278, "loss": 1.2246, "num_tokens": 26115944.0, "step": 412 }, { "epoch": 0.760939659143252, "grad_norm": 0.1486101895570755, "learning_rate": 0.00043677102516881524, "loss": 1.1405, "num_tokens": 26180107.0, "step": 413 }, { "epoch": 0.7627821280515892, "grad_norm": 0.1094764918088913, "learning_rate": 0.00043661755678330266, "loss": 1.2978, "num_tokens": 26243844.0, "step": 414 }, { "epoch": 0.7646245969599264, "grad_norm": 0.10461403429508209, "learning_rate": 0.0004364640883977901, "loss": 1.1844, "num_tokens": 26307899.0, "step": 415 }, { "epoch": 0.7664670658682635, "grad_norm": 0.10358037054538727, "learning_rate": 0.0004363106200122775, "loss": 1.1846, "num_tokens": 26371478.0, "step": 416 }, { "epoch": 0.7683095347766007, "grad_norm": 0.10155554115772247, "learning_rate": 0.0004361571516267649, "loss": 1.2091, "num_tokens": 26435416.0, "step": 417 }, { "epoch": 0.7701520036849379, "grad_norm": 0.10498125851154327, "learning_rate": 0.00043600368324125233, "loss": 1.1557, "num_tokens": 26500189.0, "step": 418 }, { "epoch": 0.771994472593275, "grad_norm": 0.11268804222345352, "learning_rate": 0.00043585021485573975, "loss": 1.257, "num_tokens": 26562894.0, "step": 419 }, { "epoch": 0.7738369415016122, "grad_norm": 0.10611606389284134, "learning_rate": 0.00043569674647022717, "loss": 1.135, "num_tokens": 26627464.0, "step": 420 }, { "epoch": 0.7756794104099494, "grad_norm": 0.10143210738897324, "learning_rate": 0.00043554327808471453, "loss": 1.1478, "num_tokens": 26692002.0, "step": 421 }, { "epoch": 0.7775218793182865, "grad_norm": 0.10348612070083618, "learning_rate": 0.000435389809699202, "loss": 1.2446, "num_tokens": 26755992.0, "step": 422 }, { "epoch": 0.7793643482266237, "grad_norm": 0.10786455124616623, "learning_rate": 0.00043523634131368937, "loss": 1.1806, "num_tokens": 26818877.0, "step": 423 }, { "epoch": 0.7812068171349609, "grad_norm": 0.10393750667572021, "learning_rate": 0.00043508287292817684, "loss": 1.199, "num_tokens": 26882602.0, "step": 424 }, { "epoch": 0.783049286043298, "grad_norm": 0.10196984559297562, "learning_rate": 0.0004349294045426642, "loss": 1.2779, "num_tokens": 26946257.0, "step": 425 }, { "epoch": 0.7848917549516352, "grad_norm": 0.10630695521831512, "learning_rate": 0.0004347759361571516, "loss": 1.2657, "num_tokens": 27008258.0, "step": 426 }, { "epoch": 0.7867342238599724, "grad_norm": 0.1043534055352211, "learning_rate": 0.00043462246777163904, "loss": 1.1708, "num_tokens": 27072565.0, "step": 427 }, { "epoch": 0.7885766927683096, "grad_norm": 0.10614557564258575, "learning_rate": 0.00043446899938612645, "loss": 1.2048, "num_tokens": 27136652.0, "step": 428 }, { "epoch": 0.7904191616766467, "grad_norm": 0.1268279254436493, "learning_rate": 0.00043431553100061387, "loss": 1.3063, "num_tokens": 27201507.0, "step": 429 }, { "epoch": 0.7922616305849839, "grad_norm": 0.10701017081737518, "learning_rate": 0.0004341620626151013, "loss": 1.1534, "num_tokens": 27266208.0, "step": 430 }, { "epoch": 0.7941040994933211, "grad_norm": 0.09919115900993347, "learning_rate": 0.0004340085942295887, "loss": 1.2567, "num_tokens": 27329896.0, "step": 431 }, { "epoch": 0.7959465684016582, "grad_norm": 0.10987650603055954, "learning_rate": 0.0004338551258440761, "loss": 1.1642, "num_tokens": 27393933.0, "step": 432 }, { "epoch": 0.7977890373099954, "grad_norm": 0.10585780441761017, "learning_rate": 0.00043370165745856354, "loss": 1.1511, "num_tokens": 27458069.0, "step": 433 }, { "epoch": 0.7996315062183326, "grad_norm": 0.11062242835760117, "learning_rate": 0.00043354818907305096, "loss": 1.2193, "num_tokens": 27521965.0, "step": 434 }, { "epoch": 0.8014739751266697, "grad_norm": 0.11509957164525986, "learning_rate": 0.0004333947206875384, "loss": 1.2158, "num_tokens": 27582712.0, "step": 435 }, { "epoch": 0.8033164440350069, "grad_norm": 0.10878728330135345, "learning_rate": 0.00043324125230202574, "loss": 1.2124, "num_tokens": 27646449.0, "step": 436 }, { "epoch": 0.8051589129433441, "grad_norm": 0.10628356039524078, "learning_rate": 0.0004330877839165132, "loss": 1.2296, "num_tokens": 27708513.0, "step": 437 }, { "epoch": 0.8070013818516812, "grad_norm": 0.10199989378452301, "learning_rate": 0.0004329343155310006, "loss": 1.2591, "num_tokens": 27771284.0, "step": 438 }, { "epoch": 0.8088438507600184, "grad_norm": 0.10415118932723999, "learning_rate": 0.00043278084714548805, "loss": 1.2435, "num_tokens": 27834722.0, "step": 439 }, { "epoch": 0.8106863196683556, "grad_norm": 0.1128695160150528, "learning_rate": 0.0004326273787599754, "loss": 1.2921, "num_tokens": 27898282.0, "step": 440 }, { "epoch": 0.8125287885766928, "grad_norm": 0.10561270266771317, "learning_rate": 0.0004324739103744629, "loss": 1.2964, "num_tokens": 27961049.0, "step": 441 }, { "epoch": 0.8143712574850299, "grad_norm": 0.10623180866241455, "learning_rate": 0.00043232044198895025, "loss": 1.2563, "num_tokens": 28025273.0, "step": 442 }, { "epoch": 0.8162137263933671, "grad_norm": 0.12049615383148193, "learning_rate": 0.0004321669736034377, "loss": 1.242, "num_tokens": 28089340.0, "step": 443 }, { "epoch": 0.8180561953017043, "grad_norm": 0.11410742998123169, "learning_rate": 0.0004320135052179251, "loss": 1.2338, "num_tokens": 28151996.0, "step": 444 }, { "epoch": 0.8198986642100414, "grad_norm": 0.10255514830350876, "learning_rate": 0.00043186003683241256, "loss": 1.1528, "num_tokens": 28216878.0, "step": 445 }, { "epoch": 0.8217411331183786, "grad_norm": 0.11291272938251495, "learning_rate": 0.0004317065684468999, "loss": 1.1931, "num_tokens": 28280596.0, "step": 446 }, { "epoch": 0.8235836020267158, "grad_norm": 0.11602190136909485, "learning_rate": 0.0004315531000613874, "loss": 1.3362, "num_tokens": 28342509.0, "step": 447 }, { "epoch": 0.8254260709350529, "grad_norm": 0.10742507129907608, "learning_rate": 0.00043139963167587476, "loss": 1.2032, "num_tokens": 28406956.0, "step": 448 }, { "epoch": 0.8272685398433901, "grad_norm": 0.10775892436504364, "learning_rate": 0.0004312461632903622, "loss": 1.2853, "num_tokens": 28470006.0, "step": 449 }, { "epoch": 0.8291110087517273, "grad_norm": 0.11267748475074768, "learning_rate": 0.0004310926949048496, "loss": 1.1791, "num_tokens": 28533300.0, "step": 450 }, { "epoch": 0.8309534776600644, "grad_norm": 0.10543155670166016, "learning_rate": 0.00043093922651933706, "loss": 1.1983, "num_tokens": 28597727.0, "step": 451 }, { "epoch": 0.8327959465684016, "grad_norm": 0.1096687912940979, "learning_rate": 0.0004307857581338244, "loss": 1.1293, "num_tokens": 28659103.0, "step": 452 }, { "epoch": 0.8346384154767388, "grad_norm": 0.1046469435095787, "learning_rate": 0.00043063228974831184, "loss": 1.2581, "num_tokens": 28723240.0, "step": 453 }, { "epoch": 0.836480884385076, "grad_norm": 0.1104128435254097, "learning_rate": 0.00043047882136279926, "loss": 1.2349, "num_tokens": 28786365.0, "step": 454 }, { "epoch": 0.8383233532934131, "grad_norm": 0.10569712519645691, "learning_rate": 0.0004303253529772867, "loss": 1.2135, "num_tokens": 28849378.0, "step": 455 }, { "epoch": 0.8401658222017504, "grad_norm": 0.10911333560943604, "learning_rate": 0.0004301718845917741, "loss": 1.2898, "num_tokens": 28912134.0, "step": 456 }, { "epoch": 0.8420082911100876, "grad_norm": 0.11302676796913147, "learning_rate": 0.0004300184162062615, "loss": 1.2284, "num_tokens": 28975165.0, "step": 457 }, { "epoch": 0.8438507600184247, "grad_norm": 0.10635825991630554, "learning_rate": 0.00042986494782074893, "loss": 1.1654, "num_tokens": 29039369.0, "step": 458 }, { "epoch": 0.8456932289267619, "grad_norm": 0.1070987731218338, "learning_rate": 0.00042971147943523635, "loss": 1.2143, "num_tokens": 29102708.0, "step": 459 }, { "epoch": 0.8475356978350991, "grad_norm": 0.10710791498422623, "learning_rate": 0.00042955801104972377, "loss": 1.1828, "num_tokens": 29166235.0, "step": 460 }, { "epoch": 0.8493781667434362, "grad_norm": 0.10637195408344269, "learning_rate": 0.0004294045426642112, "loss": 1.2641, "num_tokens": 29229606.0, "step": 461 }, { "epoch": 0.8512206356517734, "grad_norm": 0.10710655897855759, "learning_rate": 0.0004292510742786986, "loss": 1.2029, "num_tokens": 29292042.0, "step": 462 }, { "epoch": 0.8530631045601106, "grad_norm": 0.10616612434387207, "learning_rate": 0.000429097605893186, "loss": 1.2049, "num_tokens": 29356520.0, "step": 463 }, { "epoch": 0.8549055734684478, "grad_norm": 0.10606261342763901, "learning_rate": 0.00042894413750767344, "loss": 1.1897, "num_tokens": 29419945.0, "step": 464 }, { "epoch": 0.8567480423767849, "grad_norm": 0.11182323098182678, "learning_rate": 0.00042879066912216086, "loss": 1.2303, "num_tokens": 29482275.0, "step": 465 }, { "epoch": 0.8585905112851221, "grad_norm": 0.09934931248426437, "learning_rate": 0.0004286372007366483, "loss": 1.138, "num_tokens": 29546758.0, "step": 466 }, { "epoch": 0.8604329801934593, "grad_norm": 0.11121580749750137, "learning_rate": 0.0004284837323511357, "loss": 1.1685, "num_tokens": 29610328.0, "step": 467 }, { "epoch": 0.8622754491017964, "grad_norm": 0.10641182214021683, "learning_rate": 0.00042833026396562306, "loss": 1.175, "num_tokens": 29673100.0, "step": 468 }, { "epoch": 0.8641179180101336, "grad_norm": 0.1050378754734993, "learning_rate": 0.00042817679558011053, "loss": 1.2408, "num_tokens": 29736908.0, "step": 469 }, { "epoch": 0.8659603869184708, "grad_norm": 0.10541864484548569, "learning_rate": 0.0004280233271945979, "loss": 1.2436, "num_tokens": 29800030.0, "step": 470 }, { "epoch": 0.8678028558268079, "grad_norm": 0.10389821976423264, "learning_rate": 0.00042786985880908536, "loss": 1.3023, "num_tokens": 29864192.0, "step": 471 }, { "epoch": 0.8696453247351451, "grad_norm": 0.10352307558059692, "learning_rate": 0.0004277163904235727, "loss": 1.296, "num_tokens": 29928522.0, "step": 472 }, { "epoch": 0.8714877936434823, "grad_norm": 0.10603337734937668, "learning_rate": 0.0004275629220380602, "loss": 1.1406, "num_tokens": 29992049.0, "step": 473 }, { "epoch": 0.8733302625518194, "grad_norm": 0.1054159477353096, "learning_rate": 0.00042740945365254756, "loss": 1.2537, "num_tokens": 30055527.0, "step": 474 }, { "epoch": 0.8751727314601566, "grad_norm": 0.1049441397190094, "learning_rate": 0.00042725598526703503, "loss": 1.3166, "num_tokens": 30119736.0, "step": 475 }, { "epoch": 0.8770152003684938, "grad_norm": 0.11269146203994751, "learning_rate": 0.0004271025168815224, "loss": 1.1841, "num_tokens": 30183019.0, "step": 476 }, { "epoch": 0.878857669276831, "grad_norm": 0.10870116949081421, "learning_rate": 0.00042694904849600987, "loss": 1.2359, "num_tokens": 30247775.0, "step": 477 }, { "epoch": 0.8807001381851681, "grad_norm": 0.1110510304570198, "learning_rate": 0.00042679558011049723, "loss": 1.2975, "num_tokens": 30311311.0, "step": 478 }, { "epoch": 0.8825426070935053, "grad_norm": 0.11001905053853989, "learning_rate": 0.0004266421117249847, "loss": 1.1835, "num_tokens": 30373473.0, "step": 479 }, { "epoch": 0.8843850760018425, "grad_norm": 0.1087229922413826, "learning_rate": 0.00042648864333947207, "loss": 1.286, "num_tokens": 30435352.0, "step": 480 }, { "epoch": 0.8862275449101796, "grad_norm": 0.11301674693822861, "learning_rate": 0.00042633517495395954, "loss": 1.2324, "num_tokens": 30500159.0, "step": 481 }, { "epoch": 0.8880700138185168, "grad_norm": 0.12084916234016418, "learning_rate": 0.0004261817065684469, "loss": 1.2714, "num_tokens": 30562948.0, "step": 482 }, { "epoch": 0.889912482726854, "grad_norm": 0.11421308666467667, "learning_rate": 0.0004260282381829343, "loss": 1.2575, "num_tokens": 30626538.0, "step": 483 }, { "epoch": 0.8917549516351911, "grad_norm": 0.10971112549304962, "learning_rate": 0.00042587476979742174, "loss": 1.1754, "num_tokens": 30689585.0, "step": 484 }, { "epoch": 0.8935974205435283, "grad_norm": 0.10548132658004761, "learning_rate": 0.00042572130141190916, "loss": 1.2866, "num_tokens": 30754894.0, "step": 485 }, { "epoch": 0.8954398894518655, "grad_norm": 0.10521477460861206, "learning_rate": 0.0004255678330263966, "loss": 1.2668, "num_tokens": 30818549.0, "step": 486 }, { "epoch": 0.8972823583602026, "grad_norm": 0.10960132628679276, "learning_rate": 0.000425414364640884, "loss": 1.19, "num_tokens": 30882049.0, "step": 487 }, { "epoch": 0.8991248272685398, "grad_norm": 0.10992678254842758, "learning_rate": 0.0004252608962553714, "loss": 1.2076, "num_tokens": 30946499.0, "step": 488 }, { "epoch": 0.900967296176877, "grad_norm": 0.11084679514169693, "learning_rate": 0.00042510742786985883, "loss": 1.1925, "num_tokens": 31009510.0, "step": 489 }, { "epoch": 0.9028097650852142, "grad_norm": 0.10967332869768143, "learning_rate": 0.00042495395948434625, "loss": 1.19, "num_tokens": 31072661.0, "step": 490 }, { "epoch": 0.9046522339935513, "grad_norm": 0.10935547202825546, "learning_rate": 0.00042480049109883366, "loss": 1.26, "num_tokens": 31136742.0, "step": 491 }, { "epoch": 0.9064947029018885, "grad_norm": 0.11470672488212585, "learning_rate": 0.0004246470227133211, "loss": 1.2026, "num_tokens": 31201361.0, "step": 492 }, { "epoch": 0.9083371718102257, "grad_norm": 0.1103513315320015, "learning_rate": 0.0004244935543278085, "loss": 1.13, "num_tokens": 31263690.0, "step": 493 }, { "epoch": 0.9101796407185628, "grad_norm": 0.10554005205631256, "learning_rate": 0.0004243400859422959, "loss": 1.2302, "num_tokens": 31326481.0, "step": 494 }, { "epoch": 0.9120221096269, "grad_norm": 0.11321807652711868, "learning_rate": 0.00042418661755678333, "loss": 1.2709, "num_tokens": 31390984.0, "step": 495 }, { "epoch": 0.9138645785352372, "grad_norm": 0.11590111255645752, "learning_rate": 0.00042403314917127075, "loss": 1.2334, "num_tokens": 31452727.0, "step": 496 }, { "epoch": 0.9157070474435743, "grad_norm": 0.10216066986322403, "learning_rate": 0.0004238796807857581, "loss": 1.1741, "num_tokens": 31516479.0, "step": 497 }, { "epoch": 0.9175495163519115, "grad_norm": 0.10940876603126526, "learning_rate": 0.00042372621240024553, "loss": 1.2705, "num_tokens": 31579630.0, "step": 498 }, { "epoch": 0.9193919852602487, "grad_norm": 0.11149633675813675, "learning_rate": 0.00042357274401473295, "loss": 1.2491, "num_tokens": 31642274.0, "step": 499 }, { "epoch": 0.921234454168586, "grad_norm": 0.11160603165626526, "learning_rate": 0.00042341927562922037, "loss": 1.2319, "num_tokens": 31705008.0, "step": 500 }, { "epoch": 0.9230769230769231, "grad_norm": 0.12118037045001984, "learning_rate": 0.0004232658072437078, "loss": 1.2206, "num_tokens": 31767686.0, "step": 501 }, { "epoch": 0.9249193919852603, "grad_norm": 0.10166748613119125, "learning_rate": 0.0004231123388581952, "loss": 1.2274, "num_tokens": 31831292.0, "step": 502 }, { "epoch": 0.9267618608935975, "grad_norm": 0.10773547738790512, "learning_rate": 0.0004229588704726826, "loss": 1.214, "num_tokens": 31895075.0, "step": 503 }, { "epoch": 0.9286043298019346, "grad_norm": 0.10363714396953583, "learning_rate": 0.00042280540208717004, "loss": 1.207, "num_tokens": 31960182.0, "step": 504 }, { "epoch": 0.9304467987102718, "grad_norm": 0.10553659498691559, "learning_rate": 0.00042265193370165746, "loss": 1.2505, "num_tokens": 32023434.0, "step": 505 }, { "epoch": 0.932289267618609, "grad_norm": 0.10308346152305603, "learning_rate": 0.0004224984653161449, "loss": 1.1765, "num_tokens": 32086502.0, "step": 506 }, { "epoch": 0.9341317365269461, "grad_norm": 0.10617472976446152, "learning_rate": 0.0004223449969306323, "loss": 1.1737, "num_tokens": 32149405.0, "step": 507 }, { "epoch": 0.9359742054352833, "grad_norm": 0.10901011526584625, "learning_rate": 0.0004221915285451197, "loss": 1.1726, "num_tokens": 32212740.0, "step": 508 }, { "epoch": 0.9378166743436205, "grad_norm": 0.11122769117355347, "learning_rate": 0.00042203806015960713, "loss": 1.2175, "num_tokens": 32276459.0, "step": 509 }, { "epoch": 0.9396591432519577, "grad_norm": 0.11718197166919708, "learning_rate": 0.00042188459177409455, "loss": 1.1583, "num_tokens": 32338822.0, "step": 510 }, { "epoch": 0.9415016121602948, "grad_norm": 0.11907754093408585, "learning_rate": 0.00042173112338858196, "loss": 1.2596, "num_tokens": 32402242.0, "step": 511 }, { "epoch": 0.943344081068632, "grad_norm": 0.11104631423950195, "learning_rate": 0.0004215776550030694, "loss": 1.1472, "num_tokens": 32466630.0, "step": 512 }, { "epoch": 0.9451865499769692, "grad_norm": 0.10730155557394028, "learning_rate": 0.00042142418661755674, "loss": 1.2373, "num_tokens": 32530392.0, "step": 513 }, { "epoch": 0.9470290188853063, "grad_norm": 0.11576007306575775, "learning_rate": 0.0004212707182320442, "loss": 1.1977, "num_tokens": 32592790.0, "step": 514 }, { "epoch": 0.9488714877936435, "grad_norm": 0.10468292236328125, "learning_rate": 0.0004211172498465316, "loss": 1.1844, "num_tokens": 32657189.0, "step": 515 }, { "epoch": 0.9507139567019807, "grad_norm": 0.10577072948217392, "learning_rate": 0.00042096378146101905, "loss": 1.1994, "num_tokens": 32720171.0, "step": 516 }, { "epoch": 0.9525564256103178, "grad_norm": 0.11402202397584915, "learning_rate": 0.0004208103130755064, "loss": 1.2081, "num_tokens": 32783090.0, "step": 517 }, { "epoch": 0.954398894518655, "grad_norm": 0.10880884528160095, "learning_rate": 0.0004206568446899939, "loss": 1.2267, "num_tokens": 32846213.0, "step": 518 }, { "epoch": 0.9562413634269922, "grad_norm": 0.10786429047584534, "learning_rate": 0.00042050337630448125, "loss": 1.2217, "num_tokens": 32910479.0, "step": 519 }, { "epoch": 0.9580838323353293, "grad_norm": 0.10578510910272598, "learning_rate": 0.0004203499079189687, "loss": 1.1993, "num_tokens": 32975597.0, "step": 520 }, { "epoch": 0.9599263012436665, "grad_norm": 0.10631375014781952, "learning_rate": 0.0004201964395334561, "loss": 1.2291, "num_tokens": 33038546.0, "step": 521 }, { "epoch": 0.9617687701520037, "grad_norm": 0.11033403128385544, "learning_rate": 0.00042004297114794356, "loss": 1.1794, "num_tokens": 33102252.0, "step": 522 }, { "epoch": 0.9636112390603409, "grad_norm": 0.11057453602552414, "learning_rate": 0.0004198895027624309, "loss": 1.1814, "num_tokens": 33165735.0, "step": 523 }, { "epoch": 0.965453707968678, "grad_norm": 0.10619393736124039, "learning_rate": 0.0004197360343769184, "loss": 1.17, "num_tokens": 33230050.0, "step": 524 }, { "epoch": 0.9672961768770152, "grad_norm": 0.111701101064682, "learning_rate": 0.00041958256599140576, "loss": 1.2236, "num_tokens": 33294496.0, "step": 525 }, { "epoch": 0.9691386457853524, "grad_norm": 0.11277039349079132, "learning_rate": 0.00041942909760589323, "loss": 1.1696, "num_tokens": 33358708.0, "step": 526 }, { "epoch": 0.9709811146936895, "grad_norm": 0.10504408180713654, "learning_rate": 0.0004192756292203806, "loss": 1.1564, "num_tokens": 33421704.0, "step": 527 }, { "epoch": 0.9728235836020267, "grad_norm": 0.12793457508087158, "learning_rate": 0.00041912216083486806, "loss": 1.1947, "num_tokens": 33486202.0, "step": 528 }, { "epoch": 0.9746660525103639, "grad_norm": 0.10567033290863037, "learning_rate": 0.00041896869244935543, "loss": 1.1443, "num_tokens": 33548434.0, "step": 529 }, { "epoch": 0.976508521418701, "grad_norm": 0.1110161691904068, "learning_rate": 0.00041881522406384285, "loss": 1.1549, "num_tokens": 33611980.0, "step": 530 }, { "epoch": 0.9783509903270382, "grad_norm": 0.10747774690389633, "learning_rate": 0.00041866175567833026, "loss": 1.271, "num_tokens": 33675062.0, "step": 531 }, { "epoch": 0.9801934592353754, "grad_norm": 0.11228116601705551, "learning_rate": 0.0004185082872928177, "loss": 1.1886, "num_tokens": 33738712.0, "step": 532 }, { "epoch": 0.9820359281437125, "grad_norm": 0.10935964435338974, "learning_rate": 0.0004183548189073051, "loss": 1.1992, "num_tokens": 33802335.0, "step": 533 }, { "epoch": 0.9838783970520497, "grad_norm": 0.11493658274412155, "learning_rate": 0.0004182013505217925, "loss": 1.3034, "num_tokens": 33863990.0, "step": 534 }, { "epoch": 0.9857208659603869, "grad_norm": 0.11155559122562408, "learning_rate": 0.00041804788213627993, "loss": 1.2706, "num_tokens": 33928545.0, "step": 535 }, { "epoch": 0.987563334868724, "grad_norm": 0.1027296781539917, "learning_rate": 0.00041789441375076735, "loss": 1.1854, "num_tokens": 33993305.0, "step": 536 }, { "epoch": 0.9894058037770612, "grad_norm": 0.10459999740123749, "learning_rate": 0.00041774094536525477, "loss": 1.2066, "num_tokens": 34057037.0, "step": 537 }, { "epoch": 0.9912482726853984, "grad_norm": 0.10800804942846298, "learning_rate": 0.0004175874769797422, "loss": 1.1333, "num_tokens": 34119848.0, "step": 538 }, { "epoch": 0.9930907415937356, "grad_norm": 0.10472935438156128, "learning_rate": 0.0004174340085942296, "loss": 1.1638, "num_tokens": 34183718.0, "step": 539 }, { "epoch": 0.9949332105020727, "grad_norm": 0.10892577469348907, "learning_rate": 0.000417280540208717, "loss": 1.2456, "num_tokens": 34248032.0, "step": 540 }, { "epoch": 0.9967756794104099, "grad_norm": 0.11563415825366974, "learning_rate": 0.00041712707182320444, "loss": 1.2386, "num_tokens": 34311803.0, "step": 541 }, { "epoch": 0.9986181483187471, "grad_norm": 0.10656362771987915, "learning_rate": 0.00041697360343769186, "loss": 1.2528, "num_tokens": 34375197.0, "step": 542 }, { "epoch": 1.0, "grad_norm": 0.16938377916812897, "learning_rate": 0.0004168201350521793, "loss": 1.2912, "num_tokens": 34423264.0, "step": 543 }, { "epoch": 1.0, "eval_loss": 1.2737613916397095, "eval_num_tokens": 34423264.0, "eval_runtime": 2.235, "eval_samples_per_second": 22.371, "eval_steps_per_second": 0.895, "step": 543 }, { "epoch": 1.0018424689083372, "grad_norm": 0.12801103293895721, "learning_rate": 0.0004166666666666667, "loss": 1.1219, "num_tokens": 34486563.0, "step": 544 }, { "epoch": 1.0036849378166743, "grad_norm": 0.12387723475694656, "learning_rate": 0.00041651319828115406, "loss": 1.0923, "num_tokens": 34550140.0, "step": 545 }, { "epoch": 1.0055274067250115, "grad_norm": 0.11559862643480301, "learning_rate": 0.00041635972989564153, "loss": 1.0887, "num_tokens": 34613261.0, "step": 546 }, { "epoch": 1.0073698756333487, "grad_norm": 0.1341915726661682, "learning_rate": 0.0004162062615101289, "loss": 1.1112, "num_tokens": 34676511.0, "step": 547 }, { "epoch": 1.0092123445416858, "grad_norm": 0.12326817959547043, "learning_rate": 0.00041605279312461636, "loss": 1.0272, "num_tokens": 34741383.0, "step": 548 }, { "epoch": 1.011054813450023, "grad_norm": 0.12539108097553253, "learning_rate": 0.00041589932473910373, "loss": 1.0891, "num_tokens": 34804215.0, "step": 549 }, { "epoch": 1.0128972823583602, "grad_norm": 0.11155984550714493, "learning_rate": 0.0004157458563535912, "loss": 1.0587, "num_tokens": 34867577.0, "step": 550 }, { "epoch": 1.0147397512666974, "grad_norm": 0.12110836058855057, "learning_rate": 0.00041559238796807856, "loss": 1.0767, "num_tokens": 34929104.0, "step": 551 }, { "epoch": 1.0165822201750345, "grad_norm": 0.11489541083574295, "learning_rate": 0.00041543891958256604, "loss": 1.0524, "num_tokens": 34992291.0, "step": 552 }, { "epoch": 1.0184246890833717, "grad_norm": 0.11316906660795212, "learning_rate": 0.0004152854511970534, "loss": 1.0624, "num_tokens": 35055842.0, "step": 553 }, { "epoch": 1.0202671579917089, "grad_norm": 0.12519732117652893, "learning_rate": 0.00041513198281154087, "loss": 1.0961, "num_tokens": 35119739.0, "step": 554 }, { "epoch": 1.022109626900046, "grad_norm": 0.11942681670188904, "learning_rate": 0.00041497851442602823, "loss": 1.1393, "num_tokens": 35182287.0, "step": 555 }, { "epoch": 1.0239520958083832, "grad_norm": 0.12498798966407776, "learning_rate": 0.0004148250460405157, "loss": 1.052, "num_tokens": 35244445.0, "step": 556 }, { "epoch": 1.0257945647167204, "grad_norm": 0.11694146692752838, "learning_rate": 0.00041467157765500307, "loss": 1.0581, "num_tokens": 35307295.0, "step": 557 }, { "epoch": 1.0276370336250575, "grad_norm": 0.11156492680311203, "learning_rate": 0.00041451810926949054, "loss": 1.0822, "num_tokens": 35371189.0, "step": 558 }, { "epoch": 1.0294795025333947, "grad_norm": 0.11772070825099945, "learning_rate": 0.0004143646408839779, "loss": 1.0445, "num_tokens": 35433777.0, "step": 559 }, { "epoch": 1.0313219714417319, "grad_norm": 0.11783166229724884, "learning_rate": 0.0004142111724984653, "loss": 1.0647, "num_tokens": 35497703.0, "step": 560 }, { "epoch": 1.033164440350069, "grad_norm": 0.11564221978187561, "learning_rate": 0.00041405770411295274, "loss": 1.1095, "num_tokens": 35561742.0, "step": 561 }, { "epoch": 1.0350069092584062, "grad_norm": 0.12354882061481476, "learning_rate": 0.00041390423572744016, "loss": 1.0699, "num_tokens": 35624525.0, "step": 562 }, { "epoch": 1.0368493781667434, "grad_norm": 0.11736626923084259, "learning_rate": 0.0004137507673419276, "loss": 1.0877, "num_tokens": 35688107.0, "step": 563 }, { "epoch": 1.0386918470750806, "grad_norm": 0.11937452852725983, "learning_rate": 0.000413597298956415, "loss": 1.1163, "num_tokens": 35751907.0, "step": 564 }, { "epoch": 1.0405343159834177, "grad_norm": 0.11892041563987732, "learning_rate": 0.0004134438305709024, "loss": 1.0445, "num_tokens": 35815794.0, "step": 565 }, { "epoch": 1.042376784891755, "grad_norm": 0.10886310786008835, "learning_rate": 0.00041329036218538983, "loss": 1.0049, "num_tokens": 35880008.0, "step": 566 }, { "epoch": 1.044219253800092, "grad_norm": 0.11718738824129105, "learning_rate": 0.00041313689379987725, "loss": 1.1161, "num_tokens": 35943049.0, "step": 567 }, { "epoch": 1.0460617227084292, "grad_norm": 0.11380188912153244, "learning_rate": 0.0004129834254143646, "loss": 1.0998, "num_tokens": 36006943.0, "step": 568 }, { "epoch": 1.0479041916167664, "grad_norm": 0.12192203104496002, "learning_rate": 0.0004128299570288521, "loss": 1.1373, "num_tokens": 36069294.0, "step": 569 }, { "epoch": 1.0497466605251036, "grad_norm": 0.12139210104942322, "learning_rate": 0.00041267648864333945, "loss": 1.0976, "num_tokens": 36131826.0, "step": 570 }, { "epoch": 1.0515891294334407, "grad_norm": 0.1178176999092102, "learning_rate": 0.0004125230202578269, "loss": 1.0552, "num_tokens": 36195998.0, "step": 571 }, { "epoch": 1.053431598341778, "grad_norm": 0.12444979697465897, "learning_rate": 0.0004123695518723143, "loss": 1.0417, "num_tokens": 36260359.0, "step": 572 }, { "epoch": 1.055274067250115, "grad_norm": 0.11820736527442932, "learning_rate": 0.00041221608348680175, "loss": 1.1098, "num_tokens": 36323900.0, "step": 573 }, { "epoch": 1.0571165361584522, "grad_norm": 0.11978204548358917, "learning_rate": 0.0004120626151012891, "loss": 1.1301, "num_tokens": 36387221.0, "step": 574 }, { "epoch": 1.0589590050667894, "grad_norm": 0.12661413848400116, "learning_rate": 0.0004119091467157766, "loss": 1.1029, "num_tokens": 36449408.0, "step": 575 }, { "epoch": 1.0608014739751266, "grad_norm": 0.1146896705031395, "learning_rate": 0.00041175567833026395, "loss": 1.0946, "num_tokens": 36514029.0, "step": 576 }, { "epoch": 1.0626439428834638, "grad_norm": 0.11858242005109787, "learning_rate": 0.00041160220994475137, "loss": 1.0692, "num_tokens": 36577560.0, "step": 577 }, { "epoch": 1.064486411791801, "grad_norm": 0.12135281413793564, "learning_rate": 0.0004114487415592388, "loss": 1.0611, "num_tokens": 36641203.0, "step": 578 }, { "epoch": 1.066328880700138, "grad_norm": 0.11587296426296234, "learning_rate": 0.0004112952731737262, "loss": 1.1045, "num_tokens": 36705059.0, "step": 579 }, { "epoch": 1.0681713496084753, "grad_norm": 0.12728872895240784, "learning_rate": 0.0004111418047882136, "loss": 1.1713, "num_tokens": 36768961.0, "step": 580 }, { "epoch": 1.0700138185168124, "grad_norm": 0.13300549983978271, "learning_rate": 0.00041098833640270104, "loss": 1.0812, "num_tokens": 36831401.0, "step": 581 }, { "epoch": 1.0718562874251496, "grad_norm": 0.12740281224250793, "learning_rate": 0.00041083486801718846, "loss": 1.0488, "num_tokens": 36894919.0, "step": 582 }, { "epoch": 1.0736987563334868, "grad_norm": 0.13304726779460907, "learning_rate": 0.0004106813996316759, "loss": 1.0047, "num_tokens": 36956114.0, "step": 583 }, { "epoch": 1.0755412252418242, "grad_norm": 0.11855430901050568, "learning_rate": 0.0004105279312461633, "loss": 1.063, "num_tokens": 37018910.0, "step": 584 }, { "epoch": 1.077383694150161, "grad_norm": 0.11828093230724335, "learning_rate": 0.0004103744628606507, "loss": 1.102, "num_tokens": 37083671.0, "step": 585 }, { "epoch": 1.0792261630584985, "grad_norm": 0.12145385146141052, "learning_rate": 0.00041022099447513813, "loss": 1.0989, "num_tokens": 37147494.0, "step": 586 }, { "epoch": 1.0810686319668354, "grad_norm": 0.11897139996290207, "learning_rate": 0.00041006752608962555, "loss": 1.0672, "num_tokens": 37212253.0, "step": 587 }, { "epoch": 1.0829111008751728, "grad_norm": 0.12519587576389313, "learning_rate": 0.00040991405770411296, "loss": 1.1141, "num_tokens": 37275976.0, "step": 588 }, { "epoch": 1.0847535697835098, "grad_norm": 0.1242384985089302, "learning_rate": 0.0004097605893186004, "loss": 1.0456, "num_tokens": 37339627.0, "step": 589 }, { "epoch": 1.0865960386918472, "grad_norm": 0.12598992884159088, "learning_rate": 0.0004096071209330878, "loss": 1.078, "num_tokens": 37402539.0, "step": 590 }, { "epoch": 1.0884385076001843, "grad_norm": 0.12699751555919647, "learning_rate": 0.0004094536525475752, "loss": 1.0214, "num_tokens": 37466648.0, "step": 591 }, { "epoch": 1.0902809765085215, "grad_norm": 0.11682726442813873, "learning_rate": 0.0004093001841620626, "loss": 1.0622, "num_tokens": 37531484.0, "step": 592 }, { "epoch": 1.0921234454168587, "grad_norm": 0.12184195965528488, "learning_rate": 0.00040914671577655005, "loss": 1.0242, "num_tokens": 37595658.0, "step": 593 }, { "epoch": 1.0939659143251959, "grad_norm": 0.12850064039230347, "learning_rate": 0.0004089932473910374, "loss": 1.0784, "num_tokens": 37659745.0, "step": 594 }, { "epoch": 1.095808383233533, "grad_norm": 0.12040050327777863, "learning_rate": 0.0004088397790055249, "loss": 1.1241, "num_tokens": 37723475.0, "step": 595 }, { "epoch": 1.0976508521418702, "grad_norm": 0.12761707603931427, "learning_rate": 0.00040868631062001225, "loss": 1.0739, "num_tokens": 37787867.0, "step": 596 }, { "epoch": 1.0994933210502074, "grad_norm": 0.12226364761590958, "learning_rate": 0.0004085328422344997, "loss": 1.1764, "num_tokens": 37852269.0, "step": 597 }, { "epoch": 1.1013357899585445, "grad_norm": 0.12022601813077927, "learning_rate": 0.0004083793738489871, "loss": 1.096, "num_tokens": 37916950.0, "step": 598 }, { "epoch": 1.1031782588668817, "grad_norm": 0.12100092321634293, "learning_rate": 0.00040822590546347456, "loss": 1.0071, "num_tokens": 37981152.0, "step": 599 }, { "epoch": 1.1050207277752189, "grad_norm": 0.1270889937877655, "learning_rate": 0.0004080724370779619, "loss": 1.0622, "num_tokens": 38044254.0, "step": 600 }, { "epoch": 1.106863196683556, "grad_norm": 0.12805640697479248, "learning_rate": 0.0004079189686924494, "loss": 1.0609, "num_tokens": 38108677.0, "step": 601 }, { "epoch": 1.1087056655918932, "grad_norm": 0.12489979714155197, "learning_rate": 0.00040776550030693676, "loss": 1.0961, "num_tokens": 38172560.0, "step": 602 }, { "epoch": 1.1105481345002304, "grad_norm": 0.12455274909734726, "learning_rate": 0.00040761203192142423, "loss": 1.0448, "num_tokens": 38235660.0, "step": 603 }, { "epoch": 1.1123906034085675, "grad_norm": 0.12145621329545975, "learning_rate": 0.0004074585635359116, "loss": 1.0963, "num_tokens": 38298189.0, "step": 604 }, { "epoch": 1.1142330723169047, "grad_norm": 0.11901623755693436, "learning_rate": 0.00040730509515039907, "loss": 1.1037, "num_tokens": 38361926.0, "step": 605 }, { "epoch": 1.1160755412252419, "grad_norm": 0.13306263089179993, "learning_rate": 0.00040715162676488643, "loss": 1.0658, "num_tokens": 38424936.0, "step": 606 }, { "epoch": 1.117918010133579, "grad_norm": 0.133799210190773, "learning_rate": 0.00040699815837937385, "loss": 1.0717, "num_tokens": 38487446.0, "step": 607 }, { "epoch": 1.1197604790419162, "grad_norm": 0.1230994388461113, "learning_rate": 0.00040684468999386126, "loss": 1.035, "num_tokens": 38549565.0, "step": 608 }, { "epoch": 1.1216029479502534, "grad_norm": 0.129140242934227, "learning_rate": 0.0004066912216083487, "loss": 1.0567, "num_tokens": 38612238.0, "step": 609 }, { "epoch": 1.1234454168585906, "grad_norm": 0.12152509391307831, "learning_rate": 0.0004065377532228361, "loss": 1.0645, "num_tokens": 38675295.0, "step": 610 }, { "epoch": 1.1252878857669277, "grad_norm": 0.12560808658599854, "learning_rate": 0.0004063842848373235, "loss": 1.04, "num_tokens": 38737870.0, "step": 611 }, { "epoch": 1.127130354675265, "grad_norm": 0.12172216176986694, "learning_rate": 0.00040623081645181094, "loss": 1.0913, "num_tokens": 38802250.0, "step": 612 }, { "epoch": 1.128972823583602, "grad_norm": 0.12256545573472977, "learning_rate": 0.00040607734806629835, "loss": 1.0534, "num_tokens": 38866369.0, "step": 613 }, { "epoch": 1.1308152924919392, "grad_norm": 0.11910805851221085, "learning_rate": 0.00040592387968078577, "loss": 1.013, "num_tokens": 38930498.0, "step": 614 }, { "epoch": 1.1326577614002764, "grad_norm": 0.13959896564483643, "learning_rate": 0.0004057704112952732, "loss": 1.0568, "num_tokens": 38993068.0, "step": 615 }, { "epoch": 1.1345002303086136, "grad_norm": 0.12386836111545563, "learning_rate": 0.0004056169429097606, "loss": 1.0749, "num_tokens": 39057178.0, "step": 616 }, { "epoch": 1.1363426992169507, "grad_norm": 0.1268816888332367, "learning_rate": 0.000405463474524248, "loss": 1.132, "num_tokens": 39119716.0, "step": 617 }, { "epoch": 1.138185168125288, "grad_norm": 0.11897124350070953, "learning_rate": 0.00040531000613873544, "loss": 1.0986, "num_tokens": 39183111.0, "step": 618 }, { "epoch": 1.140027637033625, "grad_norm": 0.1199638769030571, "learning_rate": 0.00040515653775322286, "loss": 1.104, "num_tokens": 39246473.0, "step": 619 }, { "epoch": 1.1418701059419623, "grad_norm": 0.12057936191558838, "learning_rate": 0.0004050030693677103, "loss": 1.0893, "num_tokens": 39311146.0, "step": 620 }, { "epoch": 1.1437125748502994, "grad_norm": 0.12372042238712311, "learning_rate": 0.0004048496009821977, "loss": 1.0509, "num_tokens": 39374040.0, "step": 621 }, { "epoch": 1.1455550437586366, "grad_norm": 0.12830191850662231, "learning_rate": 0.00040469613259668506, "loss": 1.0678, "num_tokens": 39438436.0, "step": 622 }, { "epoch": 1.1473975126669738, "grad_norm": 0.13031117618083954, "learning_rate": 0.00040454266421117253, "loss": 1.0637, "num_tokens": 39501889.0, "step": 623 }, { "epoch": 1.149239981575311, "grad_norm": 0.12363505363464355, "learning_rate": 0.0004043891958256599, "loss": 1.0805, "num_tokens": 39564852.0, "step": 624 }, { "epoch": 1.151082450483648, "grad_norm": 0.1370985060930252, "learning_rate": 0.00040423572744014737, "loss": 1.0479, "num_tokens": 39627993.0, "step": 625 }, { "epoch": 1.1529249193919853, "grad_norm": 0.1305304616689682, "learning_rate": 0.00040408225905463473, "loss": 1.1076, "num_tokens": 39692223.0, "step": 626 }, { "epoch": 1.1547673883003224, "grad_norm": 0.12175992131233215, "learning_rate": 0.0004039287906691222, "loss": 1.108, "num_tokens": 39755093.0, "step": 627 }, { "epoch": 1.1566098572086596, "grad_norm": 0.11692967265844345, "learning_rate": 0.00040377532228360956, "loss": 1.0423, "num_tokens": 39819691.0, "step": 628 }, { "epoch": 1.1584523261169968, "grad_norm": 0.11991413682699203, "learning_rate": 0.00040362185389809704, "loss": 1.074, "num_tokens": 39884132.0, "step": 629 }, { "epoch": 1.160294795025334, "grad_norm": 0.12967140972614288, "learning_rate": 0.0004034683855125844, "loss": 1.1215, "num_tokens": 39946737.0, "step": 630 }, { "epoch": 1.1621372639336711, "grad_norm": 0.13204121589660645, "learning_rate": 0.00040331491712707187, "loss": 1.1742, "num_tokens": 40009779.0, "step": 631 }, { "epoch": 1.1639797328420083, "grad_norm": 0.11530394107103348, "learning_rate": 0.00040316144874155924, "loss": 1.0501, "num_tokens": 40073817.0, "step": 632 }, { "epoch": 1.1658222017503455, "grad_norm": 0.1299007385969162, "learning_rate": 0.0004030079803560467, "loss": 1.1059, "num_tokens": 40135852.0, "step": 633 }, { "epoch": 1.1676646706586826, "grad_norm": 0.1301860213279724, "learning_rate": 0.00040285451197053407, "loss": 1.128, "num_tokens": 40199656.0, "step": 634 }, { "epoch": 1.1695071395670198, "grad_norm": 0.12071437388658524, "learning_rate": 0.00040270104358502154, "loss": 1.0387, "num_tokens": 40263936.0, "step": 635 }, { "epoch": 1.171349608475357, "grad_norm": 0.13202928006649017, "learning_rate": 0.0004025475751995089, "loss": 1.0984, "num_tokens": 40327679.0, "step": 636 }, { "epoch": 1.1731920773836941, "grad_norm": 0.1261385828256607, "learning_rate": 0.0004023941068139964, "loss": 1.0643, "num_tokens": 40390563.0, "step": 637 }, { "epoch": 1.1750345462920313, "grad_norm": 0.14418497681617737, "learning_rate": 0.00040224063842848374, "loss": 1.1507, "num_tokens": 40452700.0, "step": 638 }, { "epoch": 1.1768770152003685, "grad_norm": 0.13286694884300232, "learning_rate": 0.0004020871700429711, "loss": 1.0535, "num_tokens": 40513034.0, "step": 639 }, { "epoch": 1.1787194841087056, "grad_norm": 0.12426911294460297, "learning_rate": 0.0004019337016574586, "loss": 1.0989, "num_tokens": 40576496.0, "step": 640 }, { "epoch": 1.1805619530170428, "grad_norm": 0.12622949481010437, "learning_rate": 0.00040178023327194594, "loss": 1.0309, "num_tokens": 40640535.0, "step": 641 }, { "epoch": 1.18240442192538, "grad_norm": 0.13067370653152466, "learning_rate": 0.0004016267648864334, "loss": 1.059, "num_tokens": 40703278.0, "step": 642 }, { "epoch": 1.1842468908337171, "grad_norm": 0.12709321081638336, "learning_rate": 0.0004014732965009208, "loss": 1.0527, "num_tokens": 40766712.0, "step": 643 }, { "epoch": 1.1860893597420543, "grad_norm": 0.1284736543893814, "learning_rate": 0.00040131982811540825, "loss": 1.0745, "num_tokens": 40828496.0, "step": 644 }, { "epoch": 1.1879318286503915, "grad_norm": 0.12667329609394073, "learning_rate": 0.0004011663597298956, "loss": 1.0541, "num_tokens": 40892008.0, "step": 645 }, { "epoch": 1.1897742975587287, "grad_norm": 0.12444113940000534, "learning_rate": 0.0004010128913443831, "loss": 1.1128, "num_tokens": 40955243.0, "step": 646 }, { "epoch": 1.1916167664670658, "grad_norm": 0.12282882630825043, "learning_rate": 0.00040085942295887045, "loss": 1.1113, "num_tokens": 41018876.0, "step": 647 }, { "epoch": 1.193459235375403, "grad_norm": 0.13174371421337128, "learning_rate": 0.0004007059545733579, "loss": 1.1469, "num_tokens": 41080380.0, "step": 648 }, { "epoch": 1.1953017042837402, "grad_norm": 0.12263359874486923, "learning_rate": 0.0004005524861878453, "loss": 1.0545, "num_tokens": 41145075.0, "step": 649 }, { "epoch": 1.1971441731920773, "grad_norm": 0.13542623817920685, "learning_rate": 0.00040039901780233275, "loss": 1.0936, "num_tokens": 41208808.0, "step": 650 }, { "epoch": 1.1989866421004145, "grad_norm": 0.12266663461923599, "learning_rate": 0.0004002455494168201, "loss": 1.0691, "num_tokens": 41273214.0, "step": 651 }, { "epoch": 1.2008291110087517, "grad_norm": 0.12838315963745117, "learning_rate": 0.0004000920810313076, "loss": 1.0806, "num_tokens": 41336034.0, "step": 652 }, { "epoch": 1.2026715799170888, "grad_norm": 0.1230698749423027, "learning_rate": 0.00039993861264579495, "loss": 1.0836, "num_tokens": 41398895.0, "step": 653 }, { "epoch": 1.204514048825426, "grad_norm": 0.13295939564704895, "learning_rate": 0.00039978514426028237, "loss": 1.0722, "num_tokens": 41461118.0, "step": 654 }, { "epoch": 1.2063565177337632, "grad_norm": 0.1231362447142601, "learning_rate": 0.0003996316758747698, "loss": 1.1077, "num_tokens": 41524485.0, "step": 655 }, { "epoch": 1.2081989866421003, "grad_norm": 0.12513229250907898, "learning_rate": 0.0003994782074892572, "loss": 1.0505, "num_tokens": 41589277.0, "step": 656 }, { "epoch": 1.2100414555504375, "grad_norm": 0.12667593359947205, "learning_rate": 0.0003993247391037446, "loss": 1.0706, "num_tokens": 41653709.0, "step": 657 }, { "epoch": 1.2118839244587747, "grad_norm": 0.12337975203990936, "learning_rate": 0.00039917127071823204, "loss": 1.1226, "num_tokens": 41717449.0, "step": 658 }, { "epoch": 1.2137263933671119, "grad_norm": 0.13069283962249756, "learning_rate": 0.00039901780233271946, "loss": 1.062, "num_tokens": 41779074.0, "step": 659 }, { "epoch": 1.215568862275449, "grad_norm": 0.1264827400445938, "learning_rate": 0.0003988643339472069, "loss": 1.0773, "num_tokens": 41842836.0, "step": 660 }, { "epoch": 1.2174113311837862, "grad_norm": 0.1294403225183487, "learning_rate": 0.0003987108655616943, "loss": 1.0596, "num_tokens": 41907327.0, "step": 661 }, { "epoch": 1.2192538000921234, "grad_norm": 0.12931731343269348, "learning_rate": 0.0003985573971761817, "loss": 1.0827, "num_tokens": 41971329.0, "step": 662 }, { "epoch": 1.2210962690004605, "grad_norm": 0.13438211381435394, "learning_rate": 0.00039840392879066913, "loss": 1.0819, "num_tokens": 42034393.0, "step": 663 }, { "epoch": 1.2229387379087977, "grad_norm": 0.12451083958148956, "learning_rate": 0.00039825046040515655, "loss": 1.1101, "num_tokens": 42098592.0, "step": 664 }, { "epoch": 1.2247812068171349, "grad_norm": 0.13521477580070496, "learning_rate": 0.00039809699201964397, "loss": 1.0566, "num_tokens": 42159974.0, "step": 665 }, { "epoch": 1.226623675725472, "grad_norm": 0.1300981640815735, "learning_rate": 0.0003979435236341314, "loss": 1.0818, "num_tokens": 42223536.0, "step": 666 }, { "epoch": 1.2284661446338092, "grad_norm": 0.12449292093515396, "learning_rate": 0.0003977900552486188, "loss": 1.0495, "num_tokens": 42288565.0, "step": 667 }, { "epoch": 1.2303086135421464, "grad_norm": 0.13017182052135468, "learning_rate": 0.0003976365868631062, "loss": 1.0745, "num_tokens": 42353536.0, "step": 668 }, { "epoch": 1.2321510824504838, "grad_norm": 0.14621619880199432, "learning_rate": 0.0003974831184775936, "loss": 1.1162, "num_tokens": 42414967.0, "step": 669 }, { "epoch": 1.2339935513588207, "grad_norm": 0.1369888335466385, "learning_rate": 0.00039732965009208105, "loss": 1.061, "num_tokens": 42477959.0, "step": 670 }, { "epoch": 1.235836020267158, "grad_norm": 0.12462855130434036, "learning_rate": 0.0003971761817065684, "loss": 1.0405, "num_tokens": 42541197.0, "step": 671 }, { "epoch": 1.237678489175495, "grad_norm": 0.13860023021697998, "learning_rate": 0.0003970227133210559, "loss": 1.0127, "num_tokens": 42603188.0, "step": 672 }, { "epoch": 1.2395209580838324, "grad_norm": 0.12800508737564087, "learning_rate": 0.00039686924493554325, "loss": 1.0594, "num_tokens": 42664419.0, "step": 673 }, { "epoch": 1.2413634269921694, "grad_norm": 0.13222259283065796, "learning_rate": 0.0003967157765500307, "loss": 1.0156, "num_tokens": 42727132.0, "step": 674 }, { "epoch": 1.2432058959005068, "grad_norm": 0.13226573169231415, "learning_rate": 0.0003965623081645181, "loss": 1.086, "num_tokens": 42789243.0, "step": 675 }, { "epoch": 1.2450483648088437, "grad_norm": 0.1365078240633011, "learning_rate": 0.00039640883977900556, "loss": 1.132, "num_tokens": 42852223.0, "step": 676 }, { "epoch": 1.2468908337171811, "grad_norm": 0.12399625033140182, "learning_rate": 0.0003962553713934929, "loss": 1.0579, "num_tokens": 42915867.0, "step": 677 }, { "epoch": 1.248733302625518, "grad_norm": 0.12943506240844727, "learning_rate": 0.0003961019030079804, "loss": 1.1318, "num_tokens": 42979164.0, "step": 678 }, { "epoch": 1.2505757715338555, "grad_norm": 0.1279231756925583, "learning_rate": 0.00039594843462246776, "loss": 0.9943, "num_tokens": 43041907.0, "step": 679 }, { "epoch": 1.2524182404421924, "grad_norm": 0.1242523267865181, "learning_rate": 0.00039579496623695523, "loss": 1.0756, "num_tokens": 43104760.0, "step": 680 }, { "epoch": 1.2542607093505298, "grad_norm": 0.13291744887828827, "learning_rate": 0.0003956414978514426, "loss": 1.0219, "num_tokens": 43167902.0, "step": 681 }, { "epoch": 1.2561031782588667, "grad_norm": 0.1203235387802124, "learning_rate": 0.00039548802946593007, "loss": 0.9984, "num_tokens": 43231259.0, "step": 682 }, { "epoch": 1.2579456471672041, "grad_norm": 0.13500893115997314, "learning_rate": 0.00039533456108041743, "loss": 1.0983, "num_tokens": 43295732.0, "step": 683 }, { "epoch": 1.259788116075541, "grad_norm": 0.12775622308254242, "learning_rate": 0.00039518109269490485, "loss": 1.0107, "num_tokens": 43359809.0, "step": 684 }, { "epoch": 1.2616305849838785, "grad_norm": 0.12885712087154388, "learning_rate": 0.00039502762430939227, "loss": 0.9669, "num_tokens": 43423446.0, "step": 685 }, { "epoch": 1.2634730538922156, "grad_norm": 0.12609165906906128, "learning_rate": 0.0003948741559238797, "loss": 1.0732, "num_tokens": 43485864.0, "step": 686 }, { "epoch": 1.2653155228005528, "grad_norm": 0.13932712376117706, "learning_rate": 0.0003947206875383671, "loss": 1.0962, "num_tokens": 43549326.0, "step": 687 }, { "epoch": 1.26715799170889, "grad_norm": 0.12973806262016296, "learning_rate": 0.0003945672191528545, "loss": 1.0358, "num_tokens": 43612156.0, "step": 688 }, { "epoch": 1.2690004606172272, "grad_norm": 0.12397479265928268, "learning_rate": 0.00039441375076734194, "loss": 1.1337, "num_tokens": 43675740.0, "step": 689 }, { "epoch": 1.2708429295255643, "grad_norm": 0.12635843455791473, "learning_rate": 0.00039426028238182936, "loss": 1.0419, "num_tokens": 43740104.0, "step": 690 }, { "epoch": 1.2726853984339015, "grad_norm": 0.13801366090774536, "learning_rate": 0.00039410681399631677, "loss": 1.0743, "num_tokens": 43802501.0, "step": 691 }, { "epoch": 1.2745278673422387, "grad_norm": 0.13082829117774963, "learning_rate": 0.0003939533456108042, "loss": 1.0509, "num_tokens": 43865950.0, "step": 692 }, { "epoch": 1.2763703362505758, "grad_norm": 0.12914472818374634, "learning_rate": 0.0003937998772252916, "loss": 1.0971, "num_tokens": 43930045.0, "step": 693 }, { "epoch": 1.278212805158913, "grad_norm": 0.13754862546920776, "learning_rate": 0.000393646408839779, "loss": 1.1238, "num_tokens": 43992101.0, "step": 694 }, { "epoch": 1.2800552740672502, "grad_norm": 0.12515980005264282, "learning_rate": 0.00039349294045426644, "loss": 1.0891, "num_tokens": 44056376.0, "step": 695 }, { "epoch": 1.2818977429755873, "grad_norm": 0.12897200882434845, "learning_rate": 0.00039333947206875386, "loss": 1.0825, "num_tokens": 44119081.0, "step": 696 }, { "epoch": 1.2837402118839245, "grad_norm": 0.12154325097799301, "learning_rate": 0.0003931860036832413, "loss": 1.0578, "num_tokens": 44182233.0, "step": 697 }, { "epoch": 1.2855826807922617, "grad_norm": 0.1292453557252884, "learning_rate": 0.0003930325352977287, "loss": 1.0252, "num_tokens": 44244925.0, "step": 698 }, { "epoch": 1.2874251497005988, "grad_norm": 0.13510467112064362, "learning_rate": 0.0003928790669122161, "loss": 1.1159, "num_tokens": 44308538.0, "step": 699 }, { "epoch": 1.289267618608936, "grad_norm": 0.13416287302970886, "learning_rate": 0.00039272559852670353, "loss": 1.077, "num_tokens": 44373133.0, "step": 700 }, { "epoch": 1.2911100875172732, "grad_norm": 0.12529687583446503, "learning_rate": 0.0003925721301411909, "loss": 1.0785, "num_tokens": 44436820.0, "step": 701 }, { "epoch": 1.2929525564256104, "grad_norm": 0.1304008662700653, "learning_rate": 0.00039241866175567837, "loss": 1.062, "num_tokens": 44500376.0, "step": 702 }, { "epoch": 1.2947950253339475, "grad_norm": 0.1292172074317932, "learning_rate": 0.00039226519337016573, "loss": 1.1316, "num_tokens": 44564317.0, "step": 703 }, { "epoch": 1.2966374942422847, "grad_norm": 0.12457603961229324, "learning_rate": 0.0003921117249846532, "loss": 1.0871, "num_tokens": 44628345.0, "step": 704 }, { "epoch": 1.2984799631506219, "grad_norm": 0.13426421582698822, "learning_rate": 0.00039195825659914057, "loss": 1.035, "num_tokens": 44691984.0, "step": 705 }, { "epoch": 1.300322432058959, "grad_norm": 0.1299581378698349, "learning_rate": 0.00039180478821362804, "loss": 1.0741, "num_tokens": 44755872.0, "step": 706 }, { "epoch": 1.3021649009672962, "grad_norm": 0.13021452724933624, "learning_rate": 0.0003916513198281154, "loss": 1.1003, "num_tokens": 44821111.0, "step": 707 }, { "epoch": 1.3040073698756334, "grad_norm": 0.13607025146484375, "learning_rate": 0.0003914978514426029, "loss": 1.1634, "num_tokens": 44885387.0, "step": 708 }, { "epoch": 1.3058498387839705, "grad_norm": 0.12859688699245453, "learning_rate": 0.00039134438305709024, "loss": 1.0557, "num_tokens": 44948987.0, "step": 709 }, { "epoch": 1.3076923076923077, "grad_norm": 0.133605495095253, "learning_rate": 0.00039119091467157766, "loss": 1.0817, "num_tokens": 45011838.0, "step": 710 }, { "epoch": 1.3095347766006449, "grad_norm": 0.1267014741897583, "learning_rate": 0.0003910374462860651, "loss": 1.1387, "num_tokens": 45076314.0, "step": 711 }, { "epoch": 1.311377245508982, "grad_norm": 0.13061627745628357, "learning_rate": 0.0003908839779005525, "loss": 1.077, "num_tokens": 45140072.0, "step": 712 }, { "epoch": 1.3132197144173192, "grad_norm": 0.1281026154756546, "learning_rate": 0.0003907305095150399, "loss": 1.0704, "num_tokens": 45202348.0, "step": 713 }, { "epoch": 1.3150621833256564, "grad_norm": 0.13497942686080933, "learning_rate": 0.0003905770411295273, "loss": 1.0771, "num_tokens": 45265746.0, "step": 714 }, { "epoch": 1.3169046522339936, "grad_norm": 0.13246698677539825, "learning_rate": 0.00039042357274401474, "loss": 1.0822, "num_tokens": 45329104.0, "step": 715 }, { "epoch": 1.3187471211423307, "grad_norm": 0.12657536566257477, "learning_rate": 0.0003902701043585021, "loss": 1.0837, "num_tokens": 45393112.0, "step": 716 }, { "epoch": 1.320589590050668, "grad_norm": 0.13033834099769592, "learning_rate": 0.0003901166359729896, "loss": 1.0641, "num_tokens": 45456167.0, "step": 717 }, { "epoch": 1.322432058959005, "grad_norm": 0.12903805077075958, "learning_rate": 0.00038996316758747694, "loss": 1.0874, "num_tokens": 45519654.0, "step": 718 }, { "epoch": 1.3242745278673422, "grad_norm": 0.12729863822460175, "learning_rate": 0.0003898096992019644, "loss": 1.1305, "num_tokens": 45581549.0, "step": 719 }, { "epoch": 1.3261169967756794, "grad_norm": 0.13288886845111847, "learning_rate": 0.0003896562308164518, "loss": 1.0684, "num_tokens": 45643057.0, "step": 720 }, { "epoch": 1.3279594656840166, "grad_norm": 0.12823398411273956, "learning_rate": 0.00038950276243093925, "loss": 1.0244, "num_tokens": 45707283.0, "step": 721 }, { "epoch": 1.3298019345923537, "grad_norm": 0.13396227359771729, "learning_rate": 0.0003893492940454266, "loss": 1.0371, "num_tokens": 45770097.0, "step": 722 }, { "epoch": 1.331644403500691, "grad_norm": 0.1347714066505432, "learning_rate": 0.0003891958256599141, "loss": 1.0592, "num_tokens": 45832573.0, "step": 723 }, { "epoch": 1.333486872409028, "grad_norm": 0.1289147585630417, "learning_rate": 0.00038904235727440145, "loss": 1.0579, "num_tokens": 45896127.0, "step": 724 }, { "epoch": 1.3353293413173652, "grad_norm": 0.1299239844083786, "learning_rate": 0.0003888888888888889, "loss": 1.0702, "num_tokens": 45960062.0, "step": 725 }, { "epoch": 1.3371718102257024, "grad_norm": 0.13136406242847443, "learning_rate": 0.0003887354205033763, "loss": 1.0163, "num_tokens": 46023501.0, "step": 726 }, { "epoch": 1.3390142791340396, "grad_norm": 0.12691178917884827, "learning_rate": 0.00038858195211786376, "loss": 1.0601, "num_tokens": 46087266.0, "step": 727 }, { "epoch": 1.3408567480423768, "grad_norm": 0.13739453256130219, "learning_rate": 0.0003884284837323511, "loss": 1.0576, "num_tokens": 46149917.0, "step": 728 }, { "epoch": 1.342699216950714, "grad_norm": 0.12772402167320251, "learning_rate": 0.0003882750153468386, "loss": 1.1352, "num_tokens": 46214658.0, "step": 729 }, { "epoch": 1.344541685859051, "grad_norm": 0.13775849342346191, "learning_rate": 0.00038812154696132596, "loss": 1.0549, "num_tokens": 46278557.0, "step": 730 }, { "epoch": 1.3463841547673883, "grad_norm": 0.13911227881908417, "learning_rate": 0.0003879680785758134, "loss": 1.0611, "num_tokens": 46342506.0, "step": 731 }, { "epoch": 1.3482266236757254, "grad_norm": 0.13413330912590027, "learning_rate": 0.0003878146101903008, "loss": 1.0898, "num_tokens": 46406488.0, "step": 732 }, { "epoch": 1.3500690925840626, "grad_norm": 0.1454044133424759, "learning_rate": 0.0003876611418047882, "loss": 1.134, "num_tokens": 46467374.0, "step": 733 }, { "epoch": 1.3519115614923998, "grad_norm": 0.13860803842544556, "learning_rate": 0.0003875076734192756, "loss": 0.9948, "num_tokens": 46530639.0, "step": 734 }, { "epoch": 1.353754030400737, "grad_norm": 0.14459386467933655, "learning_rate": 0.00038735420503376304, "loss": 1.0871, "num_tokens": 46594003.0, "step": 735 }, { "epoch": 1.355596499309074, "grad_norm": 0.13003963232040405, "learning_rate": 0.00038720073664825046, "loss": 1.0469, "num_tokens": 46657522.0, "step": 736 }, { "epoch": 1.3574389682174113, "grad_norm": 0.13303177058696747, "learning_rate": 0.0003870472682627379, "loss": 1.0347, "num_tokens": 46720654.0, "step": 737 }, { "epoch": 1.3592814371257484, "grad_norm": 0.13589806854724884, "learning_rate": 0.0003868937998772253, "loss": 1.0499, "num_tokens": 46784237.0, "step": 738 }, { "epoch": 1.3611239060340856, "grad_norm": 0.13575750589370728, "learning_rate": 0.0003867403314917127, "loss": 1.0592, "num_tokens": 46849048.0, "step": 739 }, { "epoch": 1.3629663749424228, "grad_norm": 0.12788820266723633, "learning_rate": 0.00038658686310620013, "loss": 1.142, "num_tokens": 46913825.0, "step": 740 }, { "epoch": 1.36480884385076, "grad_norm": 0.12991441786289215, "learning_rate": 0.00038643339472068755, "loss": 1.0845, "num_tokens": 46977645.0, "step": 741 }, { "epoch": 1.3666513127590971, "grad_norm": 0.13096961379051208, "learning_rate": 0.00038627992633517497, "loss": 1.0198, "num_tokens": 47041702.0, "step": 742 }, { "epoch": 1.3684937816674343, "grad_norm": 0.12964561581611633, "learning_rate": 0.0003861264579496624, "loss": 1.1339, "num_tokens": 47105100.0, "step": 743 }, { "epoch": 1.3703362505757715, "grad_norm": 0.12656527757644653, "learning_rate": 0.0003859729895641498, "loss": 1.1353, "num_tokens": 47168815.0, "step": 744 }, { "epoch": 1.3721787194841086, "grad_norm": 0.13400386273860931, "learning_rate": 0.0003858195211786372, "loss": 1.0098, "num_tokens": 47231627.0, "step": 745 }, { "epoch": 1.3740211883924458, "grad_norm": 0.13001197576522827, "learning_rate": 0.0003856660527931246, "loss": 1.0666, "num_tokens": 47295684.0, "step": 746 }, { "epoch": 1.375863657300783, "grad_norm": 0.13138820230960846, "learning_rate": 0.00038551258440761206, "loss": 1.0406, "num_tokens": 47359842.0, "step": 747 }, { "epoch": 1.3777061262091201, "grad_norm": 0.17022383213043213, "learning_rate": 0.0003853591160220994, "loss": 1.132, "num_tokens": 47424406.0, "step": 748 }, { "epoch": 1.3795485951174573, "grad_norm": 0.12607988715171814, "learning_rate": 0.0003852056476365869, "loss": 1.1236, "num_tokens": 47487032.0, "step": 749 }, { "epoch": 1.3813910640257947, "grad_norm": 0.1289805769920349, "learning_rate": 0.00038505217925107426, "loss": 1.0449, "num_tokens": 47550570.0, "step": 750 }, { "epoch": 1.3832335329341316, "grad_norm": 0.13479186594486237, "learning_rate": 0.00038489871086556173, "loss": 1.046, "num_tokens": 47612646.0, "step": 751 }, { "epoch": 1.385076001842469, "grad_norm": 0.1430378258228302, "learning_rate": 0.0003847452424800491, "loss": 1.0465, "num_tokens": 47675874.0, "step": 752 }, { "epoch": 1.386918470750806, "grad_norm": 0.14014749228954315, "learning_rate": 0.00038459177409453656, "loss": 1.0983, "num_tokens": 47738331.0, "step": 753 }, { "epoch": 1.3887609396591434, "grad_norm": 0.128395214676857, "learning_rate": 0.0003844383057090239, "loss": 0.9911, "num_tokens": 47802208.0, "step": 754 }, { "epoch": 1.3906034085674803, "grad_norm": 0.1308898776769638, "learning_rate": 0.0003842848373235114, "loss": 1.0764, "num_tokens": 47865660.0, "step": 755 }, { "epoch": 1.3924458774758177, "grad_norm": 0.13973218202590942, "learning_rate": 0.00038413136893799876, "loss": 1.058, "num_tokens": 47928378.0, "step": 756 }, { "epoch": 1.3942883463841547, "grad_norm": 0.13476680219173431, "learning_rate": 0.00038397790055248623, "loss": 0.9887, "num_tokens": 47992111.0, "step": 757 }, { "epoch": 1.396130815292492, "grad_norm": 0.13395485281944275, "learning_rate": 0.0003838244321669736, "loss": 1.0471, "num_tokens": 48055149.0, "step": 758 }, { "epoch": 1.397973284200829, "grad_norm": 0.1429411768913269, "learning_rate": 0.00038367096378146107, "loss": 1.0668, "num_tokens": 48117783.0, "step": 759 }, { "epoch": 1.3998157531091664, "grad_norm": 0.13286976516246796, "learning_rate": 0.00038351749539594843, "loss": 1.1501, "num_tokens": 48182507.0, "step": 760 }, { "epoch": 1.4016582220175033, "grad_norm": 0.13405436277389526, "learning_rate": 0.0003833640270104359, "loss": 1.1256, "num_tokens": 48244914.0, "step": 761 }, { "epoch": 1.4035006909258407, "grad_norm": 0.1286347657442093, "learning_rate": 0.00038321055862492327, "loss": 1.041, "num_tokens": 48309403.0, "step": 762 }, { "epoch": 1.4053431598341777, "grad_norm": 0.13000443577766418, "learning_rate": 0.0003830570902394107, "loss": 0.9973, "num_tokens": 48373874.0, "step": 763 }, { "epoch": 1.407185628742515, "grad_norm": 0.1317588835954666, "learning_rate": 0.0003829036218538981, "loss": 1.0653, "num_tokens": 48436347.0, "step": 764 }, { "epoch": 1.409028097650852, "grad_norm": 0.13678258657455444, "learning_rate": 0.0003827501534683855, "loss": 1.0399, "num_tokens": 48500127.0, "step": 765 }, { "epoch": 1.4108705665591894, "grad_norm": 0.12911517918109894, "learning_rate": 0.00038259668508287294, "loss": 1.1464, "num_tokens": 48564658.0, "step": 766 }, { "epoch": 1.4127130354675264, "grad_norm": 0.13497161865234375, "learning_rate": 0.00038244321669736036, "loss": 1.061, "num_tokens": 48628339.0, "step": 767 }, { "epoch": 1.4145555043758637, "grad_norm": 0.12880748510360718, "learning_rate": 0.0003822897483118478, "loss": 1.0188, "num_tokens": 48690739.0, "step": 768 }, { "epoch": 1.4163979732842007, "grad_norm": 0.13399122655391693, "learning_rate": 0.0003821362799263352, "loss": 1.0935, "num_tokens": 48753743.0, "step": 769 }, { "epoch": 1.418240442192538, "grad_norm": 0.13444405794143677, "learning_rate": 0.0003819828115408226, "loss": 1.1801, "num_tokens": 48816915.0, "step": 770 }, { "epoch": 1.4200829111008753, "grad_norm": 0.13703827559947968, "learning_rate": 0.00038182934315531003, "loss": 1.0421, "num_tokens": 48880683.0, "step": 771 }, { "epoch": 1.4219253800092124, "grad_norm": 0.13169650733470917, "learning_rate": 0.00038167587476979745, "loss": 1.1085, "num_tokens": 48942959.0, "step": 772 }, { "epoch": 1.4237678489175496, "grad_norm": 0.13152791559696198, "learning_rate": 0.00038152240638428486, "loss": 1.0333, "num_tokens": 49006424.0, "step": 773 }, { "epoch": 1.4256103178258868, "grad_norm": 0.13275116682052612, "learning_rate": 0.0003813689379987723, "loss": 1.1243, "num_tokens": 49070810.0, "step": 774 }, { "epoch": 1.427452786734224, "grad_norm": 0.135684996843338, "learning_rate": 0.0003812154696132597, "loss": 1.1182, "num_tokens": 49134461.0, "step": 775 }, { "epoch": 1.429295255642561, "grad_norm": 0.1353912502527237, "learning_rate": 0.0003810620012277471, "loss": 1.1307, "num_tokens": 49196702.0, "step": 776 }, { "epoch": 1.4311377245508983, "grad_norm": 0.13089455664157867, "learning_rate": 0.00038090853284223453, "loss": 1.1377, "num_tokens": 49260056.0, "step": 777 }, { "epoch": 1.4329801934592354, "grad_norm": 0.13258096575737, "learning_rate": 0.0003807550644567219, "loss": 1.0141, "num_tokens": 49322607.0, "step": 778 }, { "epoch": 1.4348226623675726, "grad_norm": 0.13472718000411987, "learning_rate": 0.00038060159607120937, "loss": 1.0547, "num_tokens": 49385088.0, "step": 779 }, { "epoch": 1.4366651312759098, "grad_norm": 0.127224862575531, "learning_rate": 0.00038044812768569673, "loss": 1.0844, "num_tokens": 49448716.0, "step": 780 }, { "epoch": 1.438507600184247, "grad_norm": 0.129881352186203, "learning_rate": 0.00038029465930018415, "loss": 1.0219, "num_tokens": 49512931.0, "step": 781 }, { "epoch": 1.4403500690925841, "grad_norm": 0.1327105164527893, "learning_rate": 0.00038014119091467157, "loss": 1.1086, "num_tokens": 49575565.0, "step": 782 }, { "epoch": 1.4421925380009213, "grad_norm": 0.13992249965667725, "learning_rate": 0.000379987722529159, "loss": 1.054, "num_tokens": 49637956.0, "step": 783 }, { "epoch": 1.4440350069092585, "grad_norm": 0.13873405754566193, "learning_rate": 0.0003798342541436464, "loss": 1.077, "num_tokens": 49700611.0, "step": 784 }, { "epoch": 1.4458774758175956, "grad_norm": 0.13092182576656342, "learning_rate": 0.0003796807857581338, "loss": 1.0364, "num_tokens": 49765552.0, "step": 785 }, { "epoch": 1.4477199447259328, "grad_norm": 0.1328861564397812, "learning_rate": 0.00037952731737262124, "loss": 1.0787, "num_tokens": 49827582.0, "step": 786 }, { "epoch": 1.44956241363427, "grad_norm": 0.12841132283210754, "learning_rate": 0.00037937384898710866, "loss": 1.1828, "num_tokens": 49890449.0, "step": 787 }, { "epoch": 1.4514048825426071, "grad_norm": 0.1258872002363205, "learning_rate": 0.0003792203806015961, "loss": 1.0263, "num_tokens": 49954958.0, "step": 788 }, { "epoch": 1.4532473514509443, "grad_norm": 0.12833485007286072, "learning_rate": 0.0003790669122160835, "loss": 1.1344, "num_tokens": 50019522.0, "step": 789 }, { "epoch": 1.4550898203592815, "grad_norm": 0.13174398243427277, "learning_rate": 0.0003789134438305709, "loss": 0.9975, "num_tokens": 50083621.0, "step": 790 }, { "epoch": 1.4569322892676186, "grad_norm": 0.1265680342912674, "learning_rate": 0.00037875997544505833, "loss": 1.1397, "num_tokens": 50147978.0, "step": 791 }, { "epoch": 1.4587747581759558, "grad_norm": 0.1316194236278534, "learning_rate": 0.00037860650705954575, "loss": 1.1304, "num_tokens": 50212097.0, "step": 792 }, { "epoch": 1.460617227084293, "grad_norm": 0.13528521358966827, "learning_rate": 0.0003784530386740331, "loss": 1.1421, "num_tokens": 50276553.0, "step": 793 }, { "epoch": 1.4624596959926301, "grad_norm": 0.12780359387397766, "learning_rate": 0.0003782995702885206, "loss": 1.0685, "num_tokens": 50340911.0, "step": 794 }, { "epoch": 1.4643021649009673, "grad_norm": 0.12254846841096878, "learning_rate": 0.00037814610190300794, "loss": 1.0204, "num_tokens": 50405339.0, "step": 795 }, { "epoch": 1.4661446338093045, "grad_norm": 0.1333206593990326, "learning_rate": 0.0003779926335174954, "loss": 1.0344, "num_tokens": 50469178.0, "step": 796 }, { "epoch": 1.4679871027176417, "grad_norm": 0.1380300670862198, "learning_rate": 0.0003778391651319828, "loss": 1.0367, "num_tokens": 50531914.0, "step": 797 }, { "epoch": 1.4698295716259788, "grad_norm": 0.13612312078475952, "learning_rate": 0.00037768569674647025, "loss": 1.1153, "num_tokens": 50596049.0, "step": 798 }, { "epoch": 1.471672040534316, "grad_norm": 0.13668759167194366, "learning_rate": 0.0003775322283609576, "loss": 1.0117, "num_tokens": 50660097.0, "step": 799 }, { "epoch": 1.4735145094426532, "grad_norm": 0.12975311279296875, "learning_rate": 0.0003773787599754451, "loss": 1.0379, "num_tokens": 50724116.0, "step": 800 }, { "epoch": 1.4753569783509903, "grad_norm": 0.13458195328712463, "learning_rate": 0.00037722529158993245, "loss": 1.0893, "num_tokens": 50786528.0, "step": 801 }, { "epoch": 1.4771994472593275, "grad_norm": 0.13505524396896362, "learning_rate": 0.0003770718232044199, "loss": 1.0831, "num_tokens": 50849321.0, "step": 802 }, { "epoch": 1.4790419161676647, "grad_norm": 0.12924998998641968, "learning_rate": 0.0003769183548189073, "loss": 1.0742, "num_tokens": 50913798.0, "step": 803 }, { "epoch": 1.4808843850760018, "grad_norm": 0.1375415027141571, "learning_rate": 0.00037676488643339476, "loss": 1.0423, "num_tokens": 50977128.0, "step": 804 }, { "epoch": 1.482726853984339, "grad_norm": 0.12442465126514435, "learning_rate": 0.0003766114180478821, "loss": 1.1478, "num_tokens": 51041701.0, "step": 805 }, { "epoch": 1.4845693228926762, "grad_norm": 0.13494227826595306, "learning_rate": 0.0003764579496623696, "loss": 0.9542, "num_tokens": 51105693.0, "step": 806 }, { "epoch": 1.4864117918010133, "grad_norm": 0.13977153599262238, "learning_rate": 0.00037630448127685696, "loss": 1.004, "num_tokens": 51167563.0, "step": 807 }, { "epoch": 1.4882542607093505, "grad_norm": 0.13287165760993958, "learning_rate": 0.0003761510128913444, "loss": 1.1066, "num_tokens": 51231729.0, "step": 808 }, { "epoch": 1.4900967296176877, "grad_norm": 0.13393534719944, "learning_rate": 0.0003759975445058318, "loss": 1.065, "num_tokens": 51295365.0, "step": 809 }, { "epoch": 1.4919391985260249, "grad_norm": 0.13096314668655396, "learning_rate": 0.0003758440761203192, "loss": 1.0964, "num_tokens": 51359229.0, "step": 810 }, { "epoch": 1.493781667434362, "grad_norm": 0.1305319368839264, "learning_rate": 0.00037569060773480663, "loss": 1.0885, "num_tokens": 51422883.0, "step": 811 }, { "epoch": 1.4956241363426992, "grad_norm": 0.12851180136203766, "learning_rate": 0.00037553713934929405, "loss": 1.1179, "num_tokens": 51486075.0, "step": 812 }, { "epoch": 1.4974666052510364, "grad_norm": 0.1377866119146347, "learning_rate": 0.00037538367096378146, "loss": 1.0521, "num_tokens": 51548817.0, "step": 813 }, { "epoch": 1.4993090741593735, "grad_norm": 0.13508673012256622, "learning_rate": 0.0003752302025782689, "loss": 1.0318, "num_tokens": 51612016.0, "step": 814 }, { "epoch": 1.5011515430677107, "grad_norm": 0.1359851062297821, "learning_rate": 0.0003750767341927563, "loss": 1.1177, "num_tokens": 51675601.0, "step": 815 }, { "epoch": 1.5029940119760479, "grad_norm": 0.12731659412384033, "learning_rate": 0.0003749232658072437, "loss": 1.1252, "num_tokens": 51740765.0, "step": 816 }, { "epoch": 1.504836480884385, "grad_norm": 0.13190843164920807, "learning_rate": 0.00037476979742173113, "loss": 1.1111, "num_tokens": 51805082.0, "step": 817 }, { "epoch": 1.5066789497927222, "grad_norm": 0.1323549449443817, "learning_rate": 0.00037461632903621855, "loss": 1.1046, "num_tokens": 51868598.0, "step": 818 }, { "epoch": 1.5085214187010594, "grad_norm": 0.1332823485136032, "learning_rate": 0.00037446286065070597, "loss": 1.122, "num_tokens": 51931816.0, "step": 819 }, { "epoch": 1.5103638876093965, "grad_norm": 0.13884252309799194, "learning_rate": 0.0003743093922651934, "loss": 1.0911, "num_tokens": 51995739.0, "step": 820 }, { "epoch": 1.5122063565177337, "grad_norm": 0.13599342107772827, "learning_rate": 0.0003741559238796808, "loss": 1.093, "num_tokens": 52059246.0, "step": 821 }, { "epoch": 1.5140488254260709, "grad_norm": 0.14804258942604065, "learning_rate": 0.0003740024554941682, "loss": 1.1353, "num_tokens": 52122130.0, "step": 822 }, { "epoch": 1.515891294334408, "grad_norm": 0.13337133824825287, "learning_rate": 0.00037384898710865564, "loss": 1.0727, "num_tokens": 52185868.0, "step": 823 }, { "epoch": 1.5177337632427452, "grad_norm": 0.1361090987920761, "learning_rate": 0.00037369551872314306, "loss": 1.012, "num_tokens": 52248316.0, "step": 824 }, { "epoch": 1.5195762321510824, "grad_norm": 0.13409677147865295, "learning_rate": 0.0003735420503376304, "loss": 1.0123, "num_tokens": 52310702.0, "step": 825 }, { "epoch": 1.5214187010594196, "grad_norm": 0.13780620694160461, "learning_rate": 0.0003733885819521179, "loss": 1.0075, "num_tokens": 52372771.0, "step": 826 }, { "epoch": 1.523261169967757, "grad_norm": 0.13783210515975952, "learning_rate": 0.00037323511356660526, "loss": 1.0502, "num_tokens": 52434941.0, "step": 827 }, { "epoch": 1.525103638876094, "grad_norm": 0.13472455739974976, "learning_rate": 0.00037308164518109273, "loss": 1.0084, "num_tokens": 52498198.0, "step": 828 }, { "epoch": 1.5269461077844313, "grad_norm": 0.13454951345920563, "learning_rate": 0.0003729281767955801, "loss": 1.0444, "num_tokens": 52562403.0, "step": 829 }, { "epoch": 1.5287885766927682, "grad_norm": 0.1358122080564499, "learning_rate": 0.00037277470841006756, "loss": 1.0927, "num_tokens": 52626429.0, "step": 830 }, { "epoch": 1.5306310456011056, "grad_norm": 0.1359580159187317, "learning_rate": 0.00037262124002455493, "loss": 1.1235, "num_tokens": 52690064.0, "step": 831 }, { "epoch": 1.5324735145094426, "grad_norm": 0.13666477799415588, "learning_rate": 0.0003724677716390424, "loss": 1.0571, "num_tokens": 52753242.0, "step": 832 }, { "epoch": 1.53431598341778, "grad_norm": 0.1405927687883377, "learning_rate": 0.00037231430325352976, "loss": 1.0302, "num_tokens": 52816715.0, "step": 833 }, { "epoch": 1.536158452326117, "grad_norm": 0.12900520861148834, "learning_rate": 0.00037216083486801724, "loss": 0.9922, "num_tokens": 52880678.0, "step": 834 }, { "epoch": 1.5380009212344543, "grad_norm": 0.13933543860912323, "learning_rate": 0.0003720073664825046, "loss": 1.0119, "num_tokens": 52944240.0, "step": 835 }, { "epoch": 1.5398433901427913, "grad_norm": 0.13321153819561005, "learning_rate": 0.00037185389809699207, "loss": 1.0564, "num_tokens": 53008811.0, "step": 836 }, { "epoch": 1.5416858590511286, "grad_norm": 0.1284405142068863, "learning_rate": 0.00037170042971147943, "loss": 1.0643, "num_tokens": 53072134.0, "step": 837 }, { "epoch": 1.5435283279594656, "grad_norm": 0.1288565695285797, "learning_rate": 0.0003715469613259669, "loss": 1.0093, "num_tokens": 53136211.0, "step": 838 }, { "epoch": 1.545370796867803, "grad_norm": 0.13987620174884796, "learning_rate": 0.00037139349294045427, "loss": 1.0045, "num_tokens": 53198779.0, "step": 839 }, { "epoch": 1.54721326577614, "grad_norm": 0.14008845388889313, "learning_rate": 0.0003712400245549417, "loss": 1.0459, "num_tokens": 53260953.0, "step": 840 }, { "epoch": 1.5490557346844773, "grad_norm": 0.13746990263462067, "learning_rate": 0.0003710865561694291, "loss": 1.0507, "num_tokens": 53324976.0, "step": 841 }, { "epoch": 1.5508982035928143, "grad_norm": 0.13658778369426727, "learning_rate": 0.0003709330877839165, "loss": 1.095, "num_tokens": 53388311.0, "step": 842 }, { "epoch": 1.5527406725011517, "grad_norm": 0.1472586840391159, "learning_rate": 0.00037077961939840394, "loss": 1.0138, "num_tokens": 53452138.0, "step": 843 }, { "epoch": 1.5545831414094886, "grad_norm": 0.13199104368686676, "learning_rate": 0.00037062615101289136, "loss": 1.0742, "num_tokens": 53515649.0, "step": 844 }, { "epoch": 1.556425610317826, "grad_norm": 0.13417163491249084, "learning_rate": 0.0003704726826273788, "loss": 1.0491, "num_tokens": 53579142.0, "step": 845 }, { "epoch": 1.558268079226163, "grad_norm": 0.14449448883533478, "learning_rate": 0.0003703192142418662, "loss": 1.108, "num_tokens": 53642309.0, "step": 846 }, { "epoch": 1.5601105481345003, "grad_norm": 0.14179502427577972, "learning_rate": 0.0003701657458563536, "loss": 1.0627, "num_tokens": 53705649.0, "step": 847 }, { "epoch": 1.5619530170428373, "grad_norm": 0.13362976908683777, "learning_rate": 0.00037001227747084103, "loss": 1.0217, "num_tokens": 53769381.0, "step": 848 }, { "epoch": 1.5637954859511747, "grad_norm": 0.13433603942394257, "learning_rate": 0.00036985880908532845, "loss": 1.0345, "num_tokens": 53832500.0, "step": 849 }, { "epoch": 1.5656379548595116, "grad_norm": 0.13791204988956451, "learning_rate": 0.00036970534069981586, "loss": 1.0249, "num_tokens": 53895700.0, "step": 850 }, { "epoch": 1.567480423767849, "grad_norm": 0.14751125872135162, "learning_rate": 0.0003695518723143033, "loss": 1.077, "num_tokens": 53958055.0, "step": 851 }, { "epoch": 1.569322892676186, "grad_norm": 0.13405564427375793, "learning_rate": 0.00036939840392879065, "loss": 1.0094, "num_tokens": 54022060.0, "step": 852 }, { "epoch": 1.5711653615845234, "grad_norm": 0.1366650015115738, "learning_rate": 0.0003692449355432781, "loss": 1.024, "num_tokens": 54085890.0, "step": 853 }, { "epoch": 1.5730078304928603, "grad_norm": 0.13869313895702362, "learning_rate": 0.0003690914671577655, "loss": 1.0753, "num_tokens": 54148772.0, "step": 854 }, { "epoch": 1.5748502994011977, "grad_norm": 0.12851761281490326, "learning_rate": 0.0003689379987722529, "loss": 1.1453, "num_tokens": 54212779.0, "step": 855 }, { "epoch": 1.5766927683095346, "grad_norm": 0.13626216351985931, "learning_rate": 0.0003687845303867403, "loss": 1.0407, "num_tokens": 54276338.0, "step": 856 }, { "epoch": 1.578535237217872, "grad_norm": 0.13398998975753784, "learning_rate": 0.00036863106200122773, "loss": 1.0826, "num_tokens": 54338949.0, "step": 857 }, { "epoch": 1.580377706126209, "grad_norm": 0.14381282031536102, "learning_rate": 0.00036847759361571515, "loss": 1.0454, "num_tokens": 54402134.0, "step": 858 }, { "epoch": 1.5822201750345464, "grad_norm": 0.13570824265480042, "learning_rate": 0.00036832412523020257, "loss": 1.0926, "num_tokens": 54465216.0, "step": 859 }, { "epoch": 1.5840626439428833, "grad_norm": 0.14960207045078278, "learning_rate": 0.00036817065684469, "loss": 1.0924, "num_tokens": 54528153.0, "step": 860 }, { "epoch": 1.5859051128512207, "grad_norm": 0.1432880014181137, "learning_rate": 0.0003680171884591774, "loss": 1.0438, "num_tokens": 54590704.0, "step": 861 }, { "epoch": 1.5877475817595577, "grad_norm": 0.13729074597358704, "learning_rate": 0.0003678637200736648, "loss": 1.0187, "num_tokens": 54655508.0, "step": 862 }, { "epoch": 1.589590050667895, "grad_norm": 0.14445574581623077, "learning_rate": 0.00036771025168815224, "loss": 1.0508, "num_tokens": 54717487.0, "step": 863 }, { "epoch": 1.591432519576232, "grad_norm": 0.13574881851673126, "learning_rate": 0.00036755678330263966, "loss": 1.0834, "num_tokens": 54781383.0, "step": 864 }, { "epoch": 1.5932749884845694, "grad_norm": 0.1384097784757614, "learning_rate": 0.0003674033149171271, "loss": 1.1252, "num_tokens": 54844929.0, "step": 865 }, { "epoch": 1.5951174573929063, "grad_norm": 0.13757126033306122, "learning_rate": 0.0003672498465316145, "loss": 1.0299, "num_tokens": 54908858.0, "step": 866 }, { "epoch": 1.5969599263012437, "grad_norm": 0.13274484872817993, "learning_rate": 0.0003670963781461019, "loss": 1.0917, "num_tokens": 54972728.0, "step": 867 }, { "epoch": 1.5988023952095807, "grad_norm": 0.14229229092597961, "learning_rate": 0.00036694290976058933, "loss": 1.0388, "num_tokens": 55034977.0, "step": 868 }, { "epoch": 1.600644864117918, "grad_norm": 0.13969269394874573, "learning_rate": 0.00036678944137507675, "loss": 1.0636, "num_tokens": 55097268.0, "step": 869 }, { "epoch": 1.6024873330262552, "grad_norm": 0.14237268269062042, "learning_rate": 0.0003666359729895641, "loss": 1.0495, "num_tokens": 55159792.0, "step": 870 }, { "epoch": 1.6043298019345924, "grad_norm": 0.13865789771080017, "learning_rate": 0.0003664825046040516, "loss": 1.0755, "num_tokens": 55224258.0, "step": 871 }, { "epoch": 1.6061722708429296, "grad_norm": 0.13890185952186584, "learning_rate": 0.00036632903621853895, "loss": 1.0575, "num_tokens": 55287447.0, "step": 872 }, { "epoch": 1.6080147397512667, "grad_norm": 0.14000490307807922, "learning_rate": 0.0003661755678330264, "loss": 1.1276, "num_tokens": 55350604.0, "step": 873 }, { "epoch": 1.609857208659604, "grad_norm": 0.1358124166727066, "learning_rate": 0.0003660220994475138, "loss": 1.0502, "num_tokens": 55413976.0, "step": 874 }, { "epoch": 1.611699677567941, "grad_norm": 0.1319161206483841, "learning_rate": 0.00036586863106200125, "loss": 1.1356, "num_tokens": 55476367.0, "step": 875 }, { "epoch": 1.6135421464762782, "grad_norm": 0.14268425107002258, "learning_rate": 0.0003657151626764886, "loss": 1.1016, "num_tokens": 55540327.0, "step": 876 }, { "epoch": 1.6153846153846154, "grad_norm": 0.13713549077510834, "learning_rate": 0.0003655616942909761, "loss": 1.0672, "num_tokens": 55604457.0, "step": 877 }, { "epoch": 1.6172270842929526, "grad_norm": 0.13105061650276184, "learning_rate": 0.00036540822590546345, "loss": 1.1241, "num_tokens": 55668344.0, "step": 878 }, { "epoch": 1.6190695532012898, "grad_norm": 0.13389931619167328, "learning_rate": 0.0003652547575199509, "loss": 1.1127, "num_tokens": 55731645.0, "step": 879 }, { "epoch": 1.620912022109627, "grad_norm": 0.1322927325963974, "learning_rate": 0.0003651012891344383, "loss": 1.117, "num_tokens": 55795682.0, "step": 880 }, { "epoch": 1.622754491017964, "grad_norm": 0.1320277750492096, "learning_rate": 0.00036494782074892576, "loss": 1.1186, "num_tokens": 55860061.0, "step": 881 }, { "epoch": 1.6245969599263013, "grad_norm": 0.13178834319114685, "learning_rate": 0.0003647943523634131, "loss": 1.0496, "num_tokens": 55923086.0, "step": 882 }, { "epoch": 1.6264394288346384, "grad_norm": 0.133665069937706, "learning_rate": 0.0003646408839779006, "loss": 1.1106, "num_tokens": 55985871.0, "step": 883 }, { "epoch": 1.6282818977429756, "grad_norm": 0.14381074905395508, "learning_rate": 0.00036448741559238796, "loss": 1.0855, "num_tokens": 56049126.0, "step": 884 }, { "epoch": 1.6301243666513128, "grad_norm": 0.13551026582717896, "learning_rate": 0.00036433394720687543, "loss": 1.0951, "num_tokens": 56112246.0, "step": 885 }, { "epoch": 1.63196683555965, "grad_norm": 0.1377939134836197, "learning_rate": 0.0003641804788213628, "loss": 1.0369, "num_tokens": 56174492.0, "step": 886 }, { "epoch": 1.6338093044679871, "grad_norm": 0.13889652490615845, "learning_rate": 0.0003640270104358502, "loss": 1.1643, "num_tokens": 56237566.0, "step": 887 }, { "epoch": 1.6356517733763243, "grad_norm": 0.13611756265163422, "learning_rate": 0.00036387354205033763, "loss": 0.9903, "num_tokens": 56299455.0, "step": 888 }, { "epoch": 1.6374942422846614, "grad_norm": 0.13658468425273895, "learning_rate": 0.00036372007366482505, "loss": 0.9975, "num_tokens": 56363046.0, "step": 889 }, { "epoch": 1.6393367111929986, "grad_norm": 0.14619164168834686, "learning_rate": 0.00036356660527931247, "loss": 1.0717, "num_tokens": 56425622.0, "step": 890 }, { "epoch": 1.6411791801013358, "grad_norm": 0.1418885737657547, "learning_rate": 0.0003634131368937999, "loss": 0.996, "num_tokens": 56488559.0, "step": 891 }, { "epoch": 1.643021649009673, "grad_norm": 0.13534091413021088, "learning_rate": 0.0003632596685082873, "loss": 1.0414, "num_tokens": 56550845.0, "step": 892 }, { "epoch": 1.6448641179180101, "grad_norm": 0.1452713906764984, "learning_rate": 0.0003631062001227747, "loss": 1.0617, "num_tokens": 56615614.0, "step": 893 }, { "epoch": 1.6467065868263473, "grad_norm": 0.13658203184604645, "learning_rate": 0.00036295273173726214, "loss": 1.0868, "num_tokens": 56679650.0, "step": 894 }, { "epoch": 1.6485490557346845, "grad_norm": 0.13476993143558502, "learning_rate": 0.00036279926335174955, "loss": 1.0486, "num_tokens": 56741640.0, "step": 895 }, { "epoch": 1.6503915246430216, "grad_norm": 0.13899432122707367, "learning_rate": 0.00036264579496623697, "loss": 1.1058, "num_tokens": 56804341.0, "step": 896 }, { "epoch": 1.6522339935513588, "grad_norm": 0.1309812068939209, "learning_rate": 0.0003624923265807244, "loss": 1.1095, "num_tokens": 56868924.0, "step": 897 }, { "epoch": 1.654076462459696, "grad_norm": 0.13357317447662354, "learning_rate": 0.0003623388581952118, "loss": 1.0163, "num_tokens": 56933145.0, "step": 898 }, { "epoch": 1.6559189313680331, "grad_norm": 0.1391957700252533, "learning_rate": 0.0003621853898096992, "loss": 1.0302, "num_tokens": 56995335.0, "step": 899 }, { "epoch": 1.6577614002763703, "grad_norm": 0.14279860258102417, "learning_rate": 0.00036203192142418664, "loss": 1.0762, "num_tokens": 57057263.0, "step": 900 }, { "epoch": 1.6596038691847075, "grad_norm": 0.1456848382949829, "learning_rate": 0.00036187845303867406, "loss": 1.0052, "num_tokens": 57120907.0, "step": 901 }, { "epoch": 1.6614463380930447, "grad_norm": 0.13784626126289368, "learning_rate": 0.0003617249846531614, "loss": 1.065, "num_tokens": 57184852.0, "step": 902 }, { "epoch": 1.6632888070013818, "grad_norm": 0.13293471932411194, "learning_rate": 0.0003615715162676489, "loss": 1.03, "num_tokens": 57248758.0, "step": 903 }, { "epoch": 1.665131275909719, "grad_norm": 0.13485071063041687, "learning_rate": 0.00036141804788213626, "loss": 1.0593, "num_tokens": 57311232.0, "step": 904 }, { "epoch": 1.6669737448180562, "grad_norm": 0.1416548788547516, "learning_rate": 0.00036126457949662373, "loss": 1.0598, "num_tokens": 57374341.0, "step": 905 }, { "epoch": 1.6688162137263933, "grad_norm": 0.1371317058801651, "learning_rate": 0.0003611111111111111, "loss": 1.0337, "num_tokens": 57437719.0, "step": 906 }, { "epoch": 1.6706586826347305, "grad_norm": 0.13241852819919586, "learning_rate": 0.00036095764272559857, "loss": 1.1596, "num_tokens": 57501356.0, "step": 907 }, { "epoch": 1.6725011515430677, "grad_norm": 0.13943298161029816, "learning_rate": 0.00036080417434008593, "loss": 1.0716, "num_tokens": 57563288.0, "step": 908 }, { "epoch": 1.6743436204514048, "grad_norm": 0.1462913304567337, "learning_rate": 0.0003606507059545734, "loss": 1.062, "num_tokens": 57625998.0, "step": 909 }, { "epoch": 1.676186089359742, "grad_norm": 0.14652124047279358, "learning_rate": 0.00036049723756906077, "loss": 1.0392, "num_tokens": 57689587.0, "step": 910 }, { "epoch": 1.6780285582680792, "grad_norm": 0.13646289706230164, "learning_rate": 0.00036034376918354824, "loss": 1.1961, "num_tokens": 57753986.0, "step": 911 }, { "epoch": 1.6798710271764163, "grad_norm": 0.13739533722400665, "learning_rate": 0.0003601903007980356, "loss": 1.0982, "num_tokens": 57817883.0, "step": 912 }, { "epoch": 1.6817134960847535, "grad_norm": 0.1336182951927185, "learning_rate": 0.00036003683241252307, "loss": 1.0983, "num_tokens": 57881494.0, "step": 913 }, { "epoch": 1.683555964993091, "grad_norm": 0.13000032305717468, "learning_rate": 0.00035988336402701044, "loss": 1.0611, "num_tokens": 57945821.0, "step": 914 }, { "epoch": 1.6853984339014279, "grad_norm": 0.13237865269184113, "learning_rate": 0.0003597298956414979, "loss": 1.1214, "num_tokens": 58010654.0, "step": 915 }, { "epoch": 1.6872409028097652, "grad_norm": 0.14571617543697357, "learning_rate": 0.00035957642725598527, "loss": 1.041, "num_tokens": 58074166.0, "step": 916 }, { "epoch": 1.6890833717181022, "grad_norm": 0.13966834545135498, "learning_rate": 0.0003594229588704727, "loss": 1.0168, "num_tokens": 58136238.0, "step": 917 }, { "epoch": 1.6909258406264396, "grad_norm": 0.19514749944210052, "learning_rate": 0.0003592694904849601, "loss": 1.0665, "num_tokens": 58200631.0, "step": 918 }, { "epoch": 1.6927683095347765, "grad_norm": 0.13237938284873962, "learning_rate": 0.0003591160220994475, "loss": 1.0417, "num_tokens": 58265011.0, "step": 919 }, { "epoch": 1.694610778443114, "grad_norm": 0.136752188205719, "learning_rate": 0.00035896255371393494, "loss": 1.0872, "num_tokens": 58327924.0, "step": 920 }, { "epoch": 1.6964532473514509, "grad_norm": 0.14831231534481049, "learning_rate": 0.00035880908532842236, "loss": 1.093, "num_tokens": 58390320.0, "step": 921 }, { "epoch": 1.6982957162597883, "grad_norm": 0.14083169400691986, "learning_rate": 0.0003586556169429098, "loss": 1.1438, "num_tokens": 58453478.0, "step": 922 }, { "epoch": 1.7001381851681252, "grad_norm": 0.13706861436367035, "learning_rate": 0.00035850214855739714, "loss": 1.0869, "num_tokens": 58517064.0, "step": 923 }, { "epoch": 1.7019806540764626, "grad_norm": 0.13420000672340393, "learning_rate": 0.0003583486801718846, "loss": 1.0598, "num_tokens": 58581554.0, "step": 924 }, { "epoch": 1.7038231229847995, "grad_norm": 0.13543251156806946, "learning_rate": 0.000358195211786372, "loss": 1.0677, "num_tokens": 58645424.0, "step": 925 }, { "epoch": 1.705665591893137, "grad_norm": 0.13443203270435333, "learning_rate": 0.00035804174340085945, "loss": 1.0461, "num_tokens": 58708289.0, "step": 926 }, { "epoch": 1.7075080608014739, "grad_norm": 0.14595481753349304, "learning_rate": 0.0003578882750153468, "loss": 1.0302, "num_tokens": 58771532.0, "step": 927 }, { "epoch": 1.7093505297098113, "grad_norm": 0.13613392412662506, "learning_rate": 0.0003577348066298343, "loss": 1.0352, "num_tokens": 58835267.0, "step": 928 }, { "epoch": 1.7111929986181482, "grad_norm": 0.13716648519039154, "learning_rate": 0.00035758133824432165, "loss": 1.0541, "num_tokens": 58899736.0, "step": 929 }, { "epoch": 1.7130354675264856, "grad_norm": 0.13658875226974487, "learning_rate": 0.0003574278698588091, "loss": 0.9967, "num_tokens": 58964316.0, "step": 930 }, { "epoch": 1.7148779364348226, "grad_norm": 0.13264615833759308, "learning_rate": 0.0003572744014732965, "loss": 0.9942, "num_tokens": 59029003.0, "step": 931 }, { "epoch": 1.71672040534316, "grad_norm": 0.1355143040418625, "learning_rate": 0.0003571209330877839, "loss": 1.0781, "num_tokens": 59092836.0, "step": 932 }, { "epoch": 1.718562874251497, "grad_norm": 0.14615677297115326, "learning_rate": 0.0003569674647022713, "loss": 1.06, "num_tokens": 59154521.0, "step": 933 }, { "epoch": 1.7204053431598343, "grad_norm": 0.14041247963905334, "learning_rate": 0.00035681399631675874, "loss": 1.0801, "num_tokens": 59218722.0, "step": 934 }, { "epoch": 1.7222478120681712, "grad_norm": 0.14324942231178284, "learning_rate": 0.00035666052793124615, "loss": 1.0459, "num_tokens": 59281488.0, "step": 935 }, { "epoch": 1.7240902809765086, "grad_norm": 0.14051613211631775, "learning_rate": 0.00035650705954573357, "loss": 1.0109, "num_tokens": 59345685.0, "step": 936 }, { "epoch": 1.7259327498848456, "grad_norm": 0.1423776000738144, "learning_rate": 0.000356353591160221, "loss": 1.0615, "num_tokens": 59408263.0, "step": 937 }, { "epoch": 1.727775218793183, "grad_norm": 0.13416016101837158, "learning_rate": 0.0003562001227747084, "loss": 1.0779, "num_tokens": 59471801.0, "step": 938 }, { "epoch": 1.72961768770152, "grad_norm": 0.14446453750133514, "learning_rate": 0.0003560466543891958, "loss": 1.0739, "num_tokens": 59534309.0, "step": 939 }, { "epoch": 1.7314601566098573, "grad_norm": 0.1441536545753479, "learning_rate": 0.00035589318600368324, "loss": 1.0017, "num_tokens": 59597331.0, "step": 940 }, { "epoch": 1.7333026255181943, "grad_norm": 0.13224966824054718, "learning_rate": 0.00035573971761817066, "loss": 1.0635, "num_tokens": 59660498.0, "step": 941 }, { "epoch": 1.7351450944265316, "grad_norm": 0.13527892529964447, "learning_rate": 0.0003555862492326581, "loss": 1.07, "num_tokens": 59725567.0, "step": 942 }, { "epoch": 1.7369875633348686, "grad_norm": 0.13602504134178162, "learning_rate": 0.0003554327808471455, "loss": 1.0968, "num_tokens": 59789961.0, "step": 943 }, { "epoch": 1.738830032243206, "grad_norm": 0.1493835151195526, "learning_rate": 0.0003552793124616329, "loss": 1.1034, "num_tokens": 59852827.0, "step": 944 }, { "epoch": 1.740672501151543, "grad_norm": 0.1322387456893921, "learning_rate": 0.00035512584407612033, "loss": 1.0855, "num_tokens": 59916966.0, "step": 945 }, { "epoch": 1.7425149700598803, "grad_norm": 0.14197446405887604, "learning_rate": 0.00035497237569060775, "loss": 0.9688, "num_tokens": 59979884.0, "step": 946 }, { "epoch": 1.7443574389682173, "grad_norm": 0.133633092045784, "learning_rate": 0.00035481890730509517, "loss": 1.0991, "num_tokens": 60043318.0, "step": 947 }, { "epoch": 1.7461999078765547, "grad_norm": 0.145382359623909, "learning_rate": 0.0003546654389195826, "loss": 1.0297, "num_tokens": 60105908.0, "step": 948 }, { "epoch": 1.7480423767848916, "grad_norm": 0.1831725388765335, "learning_rate": 0.00035451197053406995, "loss": 1.0458, "num_tokens": 60169244.0, "step": 949 }, { "epoch": 1.749884845693229, "grad_norm": 0.14389753341674805, "learning_rate": 0.0003543585021485574, "loss": 1.1524, "num_tokens": 60233220.0, "step": 950 }, { "epoch": 1.751727314601566, "grad_norm": 0.14187029004096985, "learning_rate": 0.0003542050337630448, "loss": 1.0676, "num_tokens": 60296383.0, "step": 951 }, { "epoch": 1.7535697835099033, "grad_norm": 0.15117555856704712, "learning_rate": 0.00035405156537753226, "loss": 1.0458, "num_tokens": 60361212.0, "step": 952 }, { "epoch": 1.7554122524182403, "grad_norm": 0.14113982021808624, "learning_rate": 0.0003538980969920196, "loss": 1.1288, "num_tokens": 60424907.0, "step": 953 }, { "epoch": 1.7572547213265777, "grad_norm": 0.14195045828819275, "learning_rate": 0.0003537446286065071, "loss": 1.0544, "num_tokens": 60488844.0, "step": 954 }, { "epoch": 1.7590971902349146, "grad_norm": 0.1415780931711197, "learning_rate": 0.00035359116022099445, "loss": 1.0704, "num_tokens": 60551938.0, "step": 955 }, { "epoch": 1.760939659143252, "grad_norm": 0.13733792304992676, "learning_rate": 0.0003534376918354819, "loss": 1.0256, "num_tokens": 60616219.0, "step": 956 }, { "epoch": 1.7627821280515892, "grad_norm": 0.14412182569503784, "learning_rate": 0.0003532842234499693, "loss": 1.0658, "num_tokens": 60678858.0, "step": 957 }, { "epoch": 1.7646245969599264, "grad_norm": 0.14894133806228638, "learning_rate": 0.00035313075506445676, "loss": 1.0312, "num_tokens": 60742174.0, "step": 958 }, { "epoch": 1.7664670658682635, "grad_norm": 0.13773882389068604, "learning_rate": 0.0003529772866789441, "loss": 1.085, "num_tokens": 60806508.0, "step": 959 }, { "epoch": 1.7683095347766007, "grad_norm": 0.14279845356941223, "learning_rate": 0.0003528238182934316, "loss": 1.0745, "num_tokens": 60870722.0, "step": 960 }, { "epoch": 1.7701520036849379, "grad_norm": 0.1354806125164032, "learning_rate": 0.00035267034990791896, "loss": 1.0667, "num_tokens": 60934865.0, "step": 961 }, { "epoch": 1.771994472593275, "grad_norm": 0.1323605626821518, "learning_rate": 0.00035251688152240643, "loss": 1.1557, "num_tokens": 60998793.0, "step": 962 }, { "epoch": 1.7738369415016122, "grad_norm": 0.1420532464981079, "learning_rate": 0.0003523634131368938, "loss": 1.0245, "num_tokens": 61063032.0, "step": 963 }, { "epoch": 1.7756794104099494, "grad_norm": 0.14204001426696777, "learning_rate": 0.0003522099447513812, "loss": 1.0926, "num_tokens": 61126042.0, "step": 964 }, { "epoch": 1.7775218793182865, "grad_norm": 0.1364821344614029, "learning_rate": 0.00035205647636586863, "loss": 1.0563, "num_tokens": 61190113.0, "step": 965 }, { "epoch": 1.7793643482266237, "grad_norm": 0.1366649568080902, "learning_rate": 0.00035190300798035605, "loss": 1.023, "num_tokens": 61253300.0, "step": 966 }, { "epoch": 1.7812068171349609, "grad_norm": 0.14800086617469788, "learning_rate": 0.00035174953959484347, "loss": 1.0806, "num_tokens": 61315933.0, "step": 967 }, { "epoch": 1.783049286043298, "grad_norm": 0.15090206265449524, "learning_rate": 0.0003515960712093309, "loss": 1.0091, "num_tokens": 61380553.0, "step": 968 }, { "epoch": 1.7848917549516352, "grad_norm": 0.13830028474330902, "learning_rate": 0.0003514426028238183, "loss": 0.9765, "num_tokens": 61444379.0, "step": 969 }, { "epoch": 1.7867342238599724, "grad_norm": 0.13394901156425476, "learning_rate": 0.0003512891344383057, "loss": 1.0338, "num_tokens": 61507920.0, "step": 970 }, { "epoch": 1.7885766927683096, "grad_norm": 0.14819517731666565, "learning_rate": 0.00035113566605279314, "loss": 1.0604, "num_tokens": 61571350.0, "step": 971 }, { "epoch": 1.7904191616766467, "grad_norm": 0.13714073598384857, "learning_rate": 0.00035098219766728056, "loss": 1.0261, "num_tokens": 61635868.0, "step": 972 }, { "epoch": 1.792261630584984, "grad_norm": 0.14572075009346008, "learning_rate": 0.000350828729281768, "loss": 0.964, "num_tokens": 61699424.0, "step": 973 }, { "epoch": 1.794104099493321, "grad_norm": 0.13384932279586792, "learning_rate": 0.0003506752608962554, "loss": 1.048, "num_tokens": 61763486.0, "step": 974 }, { "epoch": 1.7959465684016582, "grad_norm": 0.14781111478805542, "learning_rate": 0.0003505217925107428, "loss": 1.0789, "num_tokens": 61827582.0, "step": 975 }, { "epoch": 1.7977890373099954, "grad_norm": 0.13754740357398987, "learning_rate": 0.0003503683241252302, "loss": 1.0444, "num_tokens": 61891352.0, "step": 976 }, { "epoch": 1.7996315062183326, "grad_norm": 0.15444348752498627, "learning_rate": 0.00035021485573971764, "loss": 1.1382, "num_tokens": 61951497.0, "step": 977 }, { "epoch": 1.8014739751266697, "grad_norm": 0.1409686654806137, "learning_rate": 0.00035006138735420506, "loss": 1.1189, "num_tokens": 62014144.0, "step": 978 }, { "epoch": 1.803316444035007, "grad_norm": 0.14326603710651398, "learning_rate": 0.0003499079189686924, "loss": 1.0083, "num_tokens": 62076729.0, "step": 979 }, { "epoch": 1.805158912943344, "grad_norm": 0.14200563728809357, "learning_rate": 0.0003497544505831799, "loss": 1.0196, "num_tokens": 62139555.0, "step": 980 }, { "epoch": 1.8070013818516812, "grad_norm": 0.14865367114543915, "learning_rate": 0.00034960098219766726, "loss": 1.0193, "num_tokens": 62202737.0, "step": 981 }, { "epoch": 1.8088438507600184, "grad_norm": 0.1349029839038849, "learning_rate": 0.00034944751381215473, "loss": 1.0347, "num_tokens": 62268015.0, "step": 982 }, { "epoch": 1.8106863196683556, "grad_norm": 0.14150698482990265, "learning_rate": 0.0003492940454266421, "loss": 1.0257, "num_tokens": 62331862.0, "step": 983 }, { "epoch": 1.8125287885766928, "grad_norm": 0.14912614226341248, "learning_rate": 0.00034914057704112957, "loss": 1.0321, "num_tokens": 62393082.0, "step": 984 }, { "epoch": 1.81437125748503, "grad_norm": 0.1414833515882492, "learning_rate": 0.00034898710865561693, "loss": 1.0294, "num_tokens": 62456505.0, "step": 985 }, { "epoch": 1.816213726393367, "grad_norm": 0.14016607403755188, "learning_rate": 0.0003488336402701044, "loss": 1.1469, "num_tokens": 62518854.0, "step": 986 }, { "epoch": 1.8180561953017043, "grad_norm": 0.1377253383398056, "learning_rate": 0.00034868017188459177, "loss": 1.0662, "num_tokens": 62581696.0, "step": 987 }, { "epoch": 1.8198986642100414, "grad_norm": 0.14641469717025757, "learning_rate": 0.00034852670349907924, "loss": 1.1097, "num_tokens": 62645107.0, "step": 988 }, { "epoch": 1.8217411331183786, "grad_norm": 0.14346802234649658, "learning_rate": 0.0003483732351135666, "loss": 1.0535, "num_tokens": 62709625.0, "step": 989 }, { "epoch": 1.8235836020267158, "grad_norm": 0.14843928813934326, "learning_rate": 0.0003482197667280541, "loss": 1.0647, "num_tokens": 62773417.0, "step": 990 }, { "epoch": 1.825426070935053, "grad_norm": 0.13763704895973206, "learning_rate": 0.00034806629834254144, "loss": 1.0168, "num_tokens": 62837865.0, "step": 991 }, { "epoch": 1.82726853984339, "grad_norm": 0.13563072681427002, "learning_rate": 0.0003479128299570289, "loss": 1.1178, "num_tokens": 62901940.0, "step": 992 }, { "epoch": 1.8291110087517273, "grad_norm": 0.14395751059055328, "learning_rate": 0.0003477593615715163, "loss": 1.0632, "num_tokens": 62965438.0, "step": 993 }, { "epoch": 1.8309534776600644, "grad_norm": 0.1381485015153885, "learning_rate": 0.00034760589318600364, "loss": 1.0178, "num_tokens": 63029475.0, "step": 994 }, { "epoch": 1.8327959465684016, "grad_norm": 0.1352289468050003, "learning_rate": 0.0003474524248004911, "loss": 1.0284, "num_tokens": 63093459.0, "step": 995 }, { "epoch": 1.8346384154767388, "grad_norm": 0.14335095882415771, "learning_rate": 0.00034729895641497847, "loss": 1.0154, "num_tokens": 63156426.0, "step": 996 }, { "epoch": 1.836480884385076, "grad_norm": 0.1345560997724533, "learning_rate": 0.00034714548802946594, "loss": 1.0304, "num_tokens": 63219948.0, "step": 997 }, { "epoch": 1.8383233532934131, "grad_norm": 0.13491497933864594, "learning_rate": 0.0003469920196439533, "loss": 1.1021, "num_tokens": 63282751.0, "step": 998 }, { "epoch": 1.8401658222017505, "grad_norm": 0.13710865378379822, "learning_rate": 0.0003468385512584408, "loss": 1.0519, "num_tokens": 63347194.0, "step": 999 }, { "epoch": 1.8420082911100875, "grad_norm": 0.14143912494182587, "learning_rate": 0.00034668508287292814, "loss": 1.0235, "num_tokens": 63409714.0, "step": 1000 }, { "epoch": 1.8438507600184249, "grad_norm": 0.1427825689315796, "learning_rate": 0.0003465316144874156, "loss": 1.0792, "num_tokens": 63472611.0, "step": 1001 }, { "epoch": 1.8456932289267618, "grad_norm": 0.15908342599868774, "learning_rate": 0.000346378146101903, "loss": 1.0689, "num_tokens": 63534881.0, "step": 1002 }, { "epoch": 1.8475356978350992, "grad_norm": 0.13086023926734924, "learning_rate": 0.00034622467771639045, "loss": 1.0139, "num_tokens": 63599223.0, "step": 1003 }, { "epoch": 1.8493781667434361, "grad_norm": 0.143429696559906, "learning_rate": 0.0003460712093308778, "loss": 1.1104, "num_tokens": 63662291.0, "step": 1004 }, { "epoch": 1.8512206356517735, "grad_norm": 0.13867072761058807, "learning_rate": 0.0003459177409453653, "loss": 1.0565, "num_tokens": 63726056.0, "step": 1005 }, { "epoch": 1.8530631045601105, "grad_norm": 0.132095605134964, "learning_rate": 0.00034576427255985265, "loss": 1.0234, "num_tokens": 63789620.0, "step": 1006 }, { "epoch": 1.8549055734684479, "grad_norm": 0.13332417607307434, "learning_rate": 0.0003456108041743401, "loss": 1.085, "num_tokens": 63853941.0, "step": 1007 }, { "epoch": 1.8567480423767848, "grad_norm": 0.14407210052013397, "learning_rate": 0.0003454573357888275, "loss": 1.0381, "num_tokens": 63917244.0, "step": 1008 }, { "epoch": 1.8585905112851222, "grad_norm": 0.13964633643627167, "learning_rate": 0.00034530386740331496, "loss": 1.0794, "num_tokens": 63981948.0, "step": 1009 }, { "epoch": 1.8604329801934592, "grad_norm": 0.14604869484901428, "learning_rate": 0.0003451503990178023, "loss": 1.0325, "num_tokens": 64045647.0, "step": 1010 }, { "epoch": 1.8622754491017965, "grad_norm": 0.13391055166721344, "learning_rate": 0.00034499693063228974, "loss": 0.9861, "num_tokens": 64109361.0, "step": 1011 }, { "epoch": 1.8641179180101335, "grad_norm": 0.1344023495912552, "learning_rate": 0.00034484346224677716, "loss": 1.0488, "num_tokens": 64173381.0, "step": 1012 }, { "epoch": 1.8659603869184709, "grad_norm": 0.14035257697105408, "learning_rate": 0.0003446899938612646, "loss": 1.069, "num_tokens": 64236777.0, "step": 1013 }, { "epoch": 1.8678028558268078, "grad_norm": 0.13856321573257446, "learning_rate": 0.000344536525475752, "loss": 1.0569, "num_tokens": 64299698.0, "step": 1014 }, { "epoch": 1.8696453247351452, "grad_norm": 0.13685572147369385, "learning_rate": 0.0003443830570902394, "loss": 1.0591, "num_tokens": 64362855.0, "step": 1015 }, { "epoch": 1.8714877936434822, "grad_norm": 0.13798482716083527, "learning_rate": 0.0003442295887047268, "loss": 1.0051, "num_tokens": 64425508.0, "step": 1016 }, { "epoch": 1.8733302625518196, "grad_norm": 0.17099057137966156, "learning_rate": 0.00034407612031921424, "loss": 1.2321, "num_tokens": 64488469.0, "step": 1017 }, { "epoch": 1.8751727314601565, "grad_norm": 0.1382746696472168, "learning_rate": 0.00034392265193370166, "loss": 1.0828, "num_tokens": 64551129.0, "step": 1018 }, { "epoch": 1.877015200368494, "grad_norm": 0.1407041996717453, "learning_rate": 0.0003437691835481891, "loss": 1.0253, "num_tokens": 64614595.0, "step": 1019 }, { "epoch": 1.8788576692768308, "grad_norm": 0.12762504816055298, "learning_rate": 0.0003436157151626765, "loss": 1.0544, "num_tokens": 64678508.0, "step": 1020 }, { "epoch": 1.8807001381851682, "grad_norm": 0.14556488394737244, "learning_rate": 0.0003434622467771639, "loss": 1.0344, "num_tokens": 64741235.0, "step": 1021 }, { "epoch": 1.8825426070935052, "grad_norm": 0.15125203132629395, "learning_rate": 0.00034330877839165133, "loss": 1.0696, "num_tokens": 64802199.0, "step": 1022 }, { "epoch": 1.8843850760018426, "grad_norm": 0.13773596286773682, "learning_rate": 0.00034315531000613875, "loss": 1.0117, "num_tokens": 64864744.0, "step": 1023 }, { "epoch": 1.8862275449101795, "grad_norm": 0.14844799041748047, "learning_rate": 0.00034300184162062617, "loss": 1.0486, "num_tokens": 64927040.0, "step": 1024 }, { "epoch": 1.888070013818517, "grad_norm": 0.14144256711006165, "learning_rate": 0.0003428483732351136, "loss": 1.009, "num_tokens": 64990879.0, "step": 1025 }, { "epoch": 1.8899124827268539, "grad_norm": 0.14125753939151764, "learning_rate": 0.00034269490484960095, "loss": 1.1168, "num_tokens": 65055605.0, "step": 1026 }, { "epoch": 1.8917549516351913, "grad_norm": 0.1349659264087677, "learning_rate": 0.0003425414364640884, "loss": 1.0319, "num_tokens": 65119029.0, "step": 1027 }, { "epoch": 1.8935974205435282, "grad_norm": 0.14192059636116028, "learning_rate": 0.0003423879680785758, "loss": 1.0786, "num_tokens": 65183391.0, "step": 1028 }, { "epoch": 1.8954398894518656, "grad_norm": 0.14411604404449463, "learning_rate": 0.00034223449969306326, "loss": 1.0296, "num_tokens": 65245338.0, "step": 1029 }, { "epoch": 1.8972823583602025, "grad_norm": 0.14297747611999512, "learning_rate": 0.0003420810313075506, "loss": 1.0725, "num_tokens": 65308916.0, "step": 1030 }, { "epoch": 1.89912482726854, "grad_norm": 0.1446768045425415, "learning_rate": 0.0003419275629220381, "loss": 0.9906, "num_tokens": 65373351.0, "step": 1031 }, { "epoch": 1.9009672961768769, "grad_norm": 0.14011329412460327, "learning_rate": 0.00034177409453652546, "loss": 1.0224, "num_tokens": 65438399.0, "step": 1032 }, { "epoch": 1.9028097650852143, "grad_norm": 0.14559093117713928, "learning_rate": 0.00034162062615101293, "loss": 1.0663, "num_tokens": 65502331.0, "step": 1033 }, { "epoch": 1.9046522339935512, "grad_norm": 0.1442815065383911, "learning_rate": 0.0003414671577655003, "loss": 1.1344, "num_tokens": 65566365.0, "step": 1034 }, { "epoch": 1.9064947029018886, "grad_norm": 0.13309608399868011, "learning_rate": 0.00034131368937998776, "loss": 1.024, "num_tokens": 65630051.0, "step": 1035 }, { "epoch": 1.9083371718102256, "grad_norm": 0.14601893723011017, "learning_rate": 0.0003411602209944751, "loss": 1.0374, "num_tokens": 65693770.0, "step": 1036 }, { "epoch": 1.910179640718563, "grad_norm": 0.1409274935722351, "learning_rate": 0.0003410067526089626, "loss": 1.0755, "num_tokens": 65757741.0, "step": 1037 }, { "epoch": 1.9120221096269, "grad_norm": 0.13729223608970642, "learning_rate": 0.00034085328422344996, "loss": 1.0287, "num_tokens": 65820478.0, "step": 1038 }, { "epoch": 1.9138645785352373, "grad_norm": 0.14172834157943726, "learning_rate": 0.00034069981583793743, "loss": 1.1349, "num_tokens": 65883951.0, "step": 1039 }, { "epoch": 1.9157070474435742, "grad_norm": 0.14250539243221283, "learning_rate": 0.0003405463474524248, "loss": 1.1044, "num_tokens": 65948037.0, "step": 1040 }, { "epoch": 1.9175495163519116, "grad_norm": 0.14007382094860077, "learning_rate": 0.0003403928790669122, "loss": 1.0548, "num_tokens": 66011652.0, "step": 1041 }, { "epoch": 1.9193919852602486, "grad_norm": 0.1429586261510849, "learning_rate": 0.00034023941068139963, "loss": 1.0669, "num_tokens": 66075362.0, "step": 1042 }, { "epoch": 1.921234454168586, "grad_norm": 0.1352539211511612, "learning_rate": 0.00034008594229588705, "loss": 1.0483, "num_tokens": 66139494.0, "step": 1043 }, { "epoch": 1.9230769230769231, "grad_norm": 0.14233103394508362, "learning_rate": 0.00033993247391037447, "loss": 1.0073, "num_tokens": 66202666.0, "step": 1044 }, { "epoch": 1.9249193919852603, "grad_norm": 0.14869600534439087, "learning_rate": 0.0003397790055248619, "loss": 1.0816, "num_tokens": 66264518.0, "step": 1045 }, { "epoch": 1.9267618608935975, "grad_norm": 0.14643168449401855, "learning_rate": 0.0003396255371393493, "loss": 0.976, "num_tokens": 66327533.0, "step": 1046 }, { "epoch": 1.9286043298019346, "grad_norm": 0.1351689100265503, "learning_rate": 0.0003394720687538367, "loss": 1.1164, "num_tokens": 66391260.0, "step": 1047 }, { "epoch": 1.9304467987102718, "grad_norm": 0.14119617640972137, "learning_rate": 0.00033931860036832414, "loss": 1.0149, "num_tokens": 66454608.0, "step": 1048 }, { "epoch": 1.932289267618609, "grad_norm": 0.14800028502941132, "learning_rate": 0.00033916513198281156, "loss": 1.0241, "num_tokens": 66515719.0, "step": 1049 }, { "epoch": 1.9341317365269461, "grad_norm": 0.14098840951919556, "learning_rate": 0.000339011663597299, "loss": 1.0422, "num_tokens": 66579485.0, "step": 1050 }, { "epoch": 1.9359742054352833, "grad_norm": 0.1450314223766327, "learning_rate": 0.0003388581952117864, "loss": 1.0631, "num_tokens": 66644157.0, "step": 1051 }, { "epoch": 1.9378166743436205, "grad_norm": 0.13933686912059784, "learning_rate": 0.0003387047268262738, "loss": 1.1518, "num_tokens": 66708681.0, "step": 1052 }, { "epoch": 1.9396591432519577, "grad_norm": 0.13747040927410126, "learning_rate": 0.00033855125844076123, "loss": 1.0346, "num_tokens": 66772609.0, "step": 1053 }, { "epoch": 1.9415016121602948, "grad_norm": 0.1320979744195938, "learning_rate": 0.00033839779005524865, "loss": 1.0557, "num_tokens": 66836432.0, "step": 1054 }, { "epoch": 1.943344081068632, "grad_norm": 0.14005760848522186, "learning_rate": 0.00033824432166973606, "loss": 1.0576, "num_tokens": 66899750.0, "step": 1055 }, { "epoch": 1.9451865499769692, "grad_norm": 0.14324940741062164, "learning_rate": 0.00033809085328422343, "loss": 1.022, "num_tokens": 66962628.0, "step": 1056 }, { "epoch": 1.9470290188853063, "grad_norm": 0.14307090640068054, "learning_rate": 0.0003379373848987109, "loss": 1.0232, "num_tokens": 67026107.0, "step": 1057 }, { "epoch": 1.9488714877936435, "grad_norm": 0.140453040599823, "learning_rate": 0.00033778391651319826, "loss": 0.9889, "num_tokens": 67088748.0, "step": 1058 }, { "epoch": 1.9507139567019807, "grad_norm": 0.14100651443004608, "learning_rate": 0.00033763044812768573, "loss": 1.1031, "num_tokens": 67152591.0, "step": 1059 }, { "epoch": 1.9525564256103178, "grad_norm": 0.1543610543012619, "learning_rate": 0.0003374769797421731, "loss": 1.0845, "num_tokens": 67216389.0, "step": 1060 }, { "epoch": 1.954398894518655, "grad_norm": 0.1388433575630188, "learning_rate": 0.00033732351135666057, "loss": 1.1156, "num_tokens": 67280591.0, "step": 1061 }, { "epoch": 1.9562413634269922, "grad_norm": 0.13792873919010162, "learning_rate": 0.00033717004297114793, "loss": 1.1021, "num_tokens": 67344143.0, "step": 1062 }, { "epoch": 1.9580838323353293, "grad_norm": 0.13591495156288147, "learning_rate": 0.0003370165745856354, "loss": 1.0664, "num_tokens": 67408027.0, "step": 1063 }, { "epoch": 1.9599263012436665, "grad_norm": 0.14591087400913239, "learning_rate": 0.00033686310620012277, "loss": 1.0257, "num_tokens": 67470242.0, "step": 1064 }, { "epoch": 1.9617687701520037, "grad_norm": 0.13191170990467072, "learning_rate": 0.0003367096378146102, "loss": 1.0701, "num_tokens": 67534582.0, "step": 1065 }, { "epoch": 1.9636112390603409, "grad_norm": 0.13897576928138733, "learning_rate": 0.0003365561694290976, "loss": 1.062, "num_tokens": 67597044.0, "step": 1066 }, { "epoch": 1.965453707968678, "grad_norm": 0.14836342632770538, "learning_rate": 0.000336402701043585, "loss": 1.0717, "num_tokens": 67660274.0, "step": 1067 }, { "epoch": 1.9672961768770152, "grad_norm": 0.14373357594013214, "learning_rate": 0.00033624923265807244, "loss": 1.0194, "num_tokens": 67723247.0, "step": 1068 }, { "epoch": 1.9691386457853524, "grad_norm": 0.14612920582294464, "learning_rate": 0.00033609576427255986, "loss": 1.0488, "num_tokens": 67786503.0, "step": 1069 }, { "epoch": 1.9709811146936895, "grad_norm": 0.13666598498821259, "learning_rate": 0.0003359422958870473, "loss": 0.987, "num_tokens": 67850921.0, "step": 1070 }, { "epoch": 1.9728235836020267, "grad_norm": 0.13729161024093628, "learning_rate": 0.0003357888275015347, "loss": 1.0572, "num_tokens": 67915692.0, "step": 1071 }, { "epoch": 1.9746660525103639, "grad_norm": 0.1485169529914856, "learning_rate": 0.0003356353591160221, "loss": 1.0335, "num_tokens": 67979669.0, "step": 1072 }, { "epoch": 1.976508521418701, "grad_norm": 0.13720309734344482, "learning_rate": 0.0003354818907305095, "loss": 1.0315, "num_tokens": 68043233.0, "step": 1073 }, { "epoch": 1.9783509903270382, "grad_norm": 0.14199182391166687, "learning_rate": 0.00033532842234499695, "loss": 1.0377, "num_tokens": 68107580.0, "step": 1074 }, { "epoch": 1.9801934592353754, "grad_norm": 0.13901129364967346, "learning_rate": 0.0003351749539594843, "loss": 1.0837, "num_tokens": 68171198.0, "step": 1075 }, { "epoch": 1.9820359281437125, "grad_norm": 0.14221207797527313, "learning_rate": 0.0003350214855739718, "loss": 1.0856, "num_tokens": 68235425.0, "step": 1076 }, { "epoch": 1.9838783970520497, "grad_norm": 0.14458948373794556, "learning_rate": 0.00033486801718845915, "loss": 1.0674, "num_tokens": 68298693.0, "step": 1077 }, { "epoch": 1.9857208659603869, "grad_norm": 0.14700518548488617, "learning_rate": 0.0003347145488029466, "loss": 1.0259, "num_tokens": 68361787.0, "step": 1078 }, { "epoch": 1.987563334868724, "grad_norm": 0.14588667452335358, "learning_rate": 0.000334561080417434, "loss": 1.0382, "num_tokens": 68424428.0, "step": 1079 }, { "epoch": 1.9894058037770612, "grad_norm": 0.14462609589099884, "learning_rate": 0.00033440761203192145, "loss": 0.9943, "num_tokens": 68485167.0, "step": 1080 }, { "epoch": 1.9912482726853984, "grad_norm": 0.1510200798511505, "learning_rate": 0.0003342541436464088, "loss": 1.0577, "num_tokens": 68547663.0, "step": 1081 }, { "epoch": 1.9930907415937356, "grad_norm": 0.1442541480064392, "learning_rate": 0.0003341006752608963, "loss": 1.0793, "num_tokens": 68610239.0, "step": 1082 }, { "epoch": 1.9949332105020727, "grad_norm": 0.14800505340099335, "learning_rate": 0.00033394720687538365, "loss": 0.9972, "num_tokens": 68674130.0, "step": 1083 }, { "epoch": 1.99677567941041, "grad_norm": 0.14415167272090912, "learning_rate": 0.0003337937384898711, "loss": 1.0113, "num_tokens": 68736540.0, "step": 1084 }, { "epoch": 1.998618148318747, "grad_norm": 0.14426662027835846, "learning_rate": 0.0003336402701043585, "loss": 0.9867, "num_tokens": 68799846.0, "step": 1085 }, { "epoch": 2.0, "grad_norm": 0.16373221576213837, "learning_rate": 0.00033348680171884596, "loss": 1.0438, "num_tokens": 68846136.0, "step": 1086 }, { "epoch": 2.0, "eval_loss": 1.2264201641082764, "eval_num_tokens": 68846136.0, "eval_runtime": 2.1367, "eval_samples_per_second": 23.4, "eval_steps_per_second": 0.936, "step": 1086 }, { "epoch": 2.0018424689083374, "grad_norm": 0.16984853148460388, "learning_rate": 0.0003333333333333333, "loss": 0.888, "num_tokens": 68911133.0, "step": 1087 }, { "epoch": 2.0036849378166743, "grad_norm": 0.1494671255350113, "learning_rate": 0.00033317986494782074, "loss": 0.9404, "num_tokens": 68974204.0, "step": 1088 }, { "epoch": 2.0055274067250117, "grad_norm": 0.14414705336093903, "learning_rate": 0.00033302639656230816, "loss": 0.8851, "num_tokens": 69037863.0, "step": 1089 }, { "epoch": 2.0073698756333487, "grad_norm": 0.16758902370929718, "learning_rate": 0.0003328729281767956, "loss": 0.8541, "num_tokens": 69100715.0, "step": 1090 }, { "epoch": 2.009212344541686, "grad_norm": 0.1788230687379837, "learning_rate": 0.000332719459791283, "loss": 0.8562, "num_tokens": 69164552.0, "step": 1091 }, { "epoch": 2.011054813450023, "grad_norm": 0.16494359076023102, "learning_rate": 0.0003325659914057704, "loss": 0.8255, "num_tokens": 69227408.0, "step": 1092 }, { "epoch": 2.0128972823583604, "grad_norm": 0.15348272025585175, "learning_rate": 0.00033241252302025783, "loss": 0.8824, "num_tokens": 69289816.0, "step": 1093 }, { "epoch": 2.0147397512666974, "grad_norm": 0.1658121943473816, "learning_rate": 0.00033225905463474525, "loss": 0.8468, "num_tokens": 69352352.0, "step": 1094 }, { "epoch": 2.0165822201750347, "grad_norm": 0.1605059653520584, "learning_rate": 0.00033210558624923266, "loss": 0.7927, "num_tokens": 69415152.0, "step": 1095 }, { "epoch": 2.0184246890833717, "grad_norm": 0.14735350012779236, "learning_rate": 0.0003319521178637201, "loss": 0.8759, "num_tokens": 69478810.0, "step": 1096 }, { "epoch": 2.020267157991709, "grad_norm": 0.15649904310703278, "learning_rate": 0.0003317986494782075, "loss": 0.8734, "num_tokens": 69541766.0, "step": 1097 }, { "epoch": 2.022109626900046, "grad_norm": 0.1575070321559906, "learning_rate": 0.0003316451810926949, "loss": 0.8157, "num_tokens": 69605321.0, "step": 1098 }, { "epoch": 2.0239520958083834, "grad_norm": 0.15113645792007446, "learning_rate": 0.00033149171270718233, "loss": 0.781, "num_tokens": 69669508.0, "step": 1099 }, { "epoch": 2.0257945647167204, "grad_norm": 0.15397216379642487, "learning_rate": 0.00033133824432166975, "loss": 0.8449, "num_tokens": 69732125.0, "step": 1100 }, { "epoch": 2.0276370336250578, "grad_norm": 0.1684069037437439, "learning_rate": 0.00033118477593615717, "loss": 0.8095, "num_tokens": 69795572.0, "step": 1101 }, { "epoch": 2.0294795025333947, "grad_norm": 0.15281590819358826, "learning_rate": 0.0003310313075506446, "loss": 0.875, "num_tokens": 69859278.0, "step": 1102 }, { "epoch": 2.031321971441732, "grad_norm": 0.15630635619163513, "learning_rate": 0.00033087783916513195, "loss": 0.817, "num_tokens": 69921343.0, "step": 1103 }, { "epoch": 2.033164440350069, "grad_norm": 0.1593100130558014, "learning_rate": 0.0003307243707796194, "loss": 0.8406, "num_tokens": 69981739.0, "step": 1104 }, { "epoch": 2.0350069092584064, "grad_norm": 0.14424654841423035, "learning_rate": 0.0003305709023941068, "loss": 0.9012, "num_tokens": 70046222.0, "step": 1105 }, { "epoch": 2.0368493781667434, "grad_norm": 0.14525814354419708, "learning_rate": 0.00033041743400859426, "loss": 0.8937, "num_tokens": 70110111.0, "step": 1106 }, { "epoch": 2.0386918470750808, "grad_norm": 0.15496230125427246, "learning_rate": 0.0003302639656230816, "loss": 0.8527, "num_tokens": 70174183.0, "step": 1107 }, { "epoch": 2.0405343159834177, "grad_norm": 0.15748001635074615, "learning_rate": 0.0003301104972375691, "loss": 0.8674, "num_tokens": 70236131.0, "step": 1108 }, { "epoch": 2.042376784891755, "grad_norm": 0.15078572928905487, "learning_rate": 0.00032995702885205646, "loss": 0.801, "num_tokens": 70299731.0, "step": 1109 }, { "epoch": 2.044219253800092, "grad_norm": 0.15767665207386017, "learning_rate": 0.00032980356046654393, "loss": 0.9181, "num_tokens": 70361486.0, "step": 1110 }, { "epoch": 2.0460617227084295, "grad_norm": 0.1591143161058426, "learning_rate": 0.0003296500920810313, "loss": 0.851, "num_tokens": 70423718.0, "step": 1111 }, { "epoch": 2.0479041916167664, "grad_norm": 0.14542070031166077, "learning_rate": 0.00032949662369551877, "loss": 0.9145, "num_tokens": 70487186.0, "step": 1112 }, { "epoch": 2.049746660525104, "grad_norm": 0.15077775716781616, "learning_rate": 0.00032934315531000613, "loss": 0.8284, "num_tokens": 70551311.0, "step": 1113 }, { "epoch": 2.0515891294334407, "grad_norm": 0.14714139699935913, "learning_rate": 0.0003291896869244936, "loss": 0.8274, "num_tokens": 70616359.0, "step": 1114 }, { "epoch": 2.053431598341778, "grad_norm": 0.15408436954021454, "learning_rate": 0.00032903621853898096, "loss": 0.8553, "num_tokens": 70679844.0, "step": 1115 }, { "epoch": 2.055274067250115, "grad_norm": 0.16367758810520172, "learning_rate": 0.00032888275015346844, "loss": 0.8322, "num_tokens": 70743453.0, "step": 1116 }, { "epoch": 2.0571165361584525, "grad_norm": 0.15314680337905884, "learning_rate": 0.0003287292817679558, "loss": 0.8671, "num_tokens": 70806703.0, "step": 1117 }, { "epoch": 2.0589590050667894, "grad_norm": 0.15240533649921417, "learning_rate": 0.00032857581338244327, "loss": 0.9175, "num_tokens": 70869084.0, "step": 1118 }, { "epoch": 2.060801473975127, "grad_norm": 0.15593239665031433, "learning_rate": 0.00032842234499693063, "loss": 0.8614, "num_tokens": 70931563.0, "step": 1119 }, { "epoch": 2.0626439428834638, "grad_norm": 0.15806686878204346, "learning_rate": 0.00032826887661141805, "loss": 0.8902, "num_tokens": 70995360.0, "step": 1120 }, { "epoch": 2.064486411791801, "grad_norm": 0.1559874415397644, "learning_rate": 0.00032811540822590547, "loss": 0.8279, "num_tokens": 71059424.0, "step": 1121 }, { "epoch": 2.066328880700138, "grad_norm": 0.1549561768770218, "learning_rate": 0.0003279619398403929, "loss": 0.9117, "num_tokens": 71121392.0, "step": 1122 }, { "epoch": 2.0681713496084755, "grad_norm": 0.15119092166423798, "learning_rate": 0.0003278084714548803, "loss": 0.8496, "num_tokens": 71184612.0, "step": 1123 }, { "epoch": 2.0700138185168124, "grad_norm": 0.1488768309354782, "learning_rate": 0.0003276550030693677, "loss": 0.9238, "num_tokens": 71248500.0, "step": 1124 }, { "epoch": 2.07185628742515, "grad_norm": 0.1480545699596405, "learning_rate": 0.00032750153468385514, "loss": 0.749, "num_tokens": 71312838.0, "step": 1125 }, { "epoch": 2.0736987563334868, "grad_norm": 0.1618625968694687, "learning_rate": 0.00032734806629834256, "loss": 0.7772, "num_tokens": 71375309.0, "step": 1126 }, { "epoch": 2.075541225241824, "grad_norm": 0.15834467113018036, "learning_rate": 0.00032719459791283, "loss": 0.8099, "num_tokens": 71438715.0, "step": 1127 }, { "epoch": 2.077383694150161, "grad_norm": 0.1582443118095398, "learning_rate": 0.0003270411295273174, "loss": 0.8625, "num_tokens": 71502149.0, "step": 1128 }, { "epoch": 2.0792261630584985, "grad_norm": 0.16046173870563507, "learning_rate": 0.0003268876611418048, "loss": 0.8476, "num_tokens": 71565133.0, "step": 1129 }, { "epoch": 2.0810686319668354, "grad_norm": 0.15636935830116272, "learning_rate": 0.00032673419275629223, "loss": 0.8212, "num_tokens": 71628990.0, "step": 1130 }, { "epoch": 2.082911100875173, "grad_norm": 0.1601140797138214, "learning_rate": 0.00032658072437077965, "loss": 0.8169, "num_tokens": 71692265.0, "step": 1131 }, { "epoch": 2.08475356978351, "grad_norm": 0.15062229335308075, "learning_rate": 0.00032642725598526707, "loss": 0.8802, "num_tokens": 71756776.0, "step": 1132 }, { "epoch": 2.086596038691847, "grad_norm": 0.15252214670181274, "learning_rate": 0.0003262737875997545, "loss": 0.8314, "num_tokens": 71820399.0, "step": 1133 }, { "epoch": 2.088438507600184, "grad_norm": 0.1666100025177002, "learning_rate": 0.0003261203192142419, "loss": 0.877, "num_tokens": 71882472.0, "step": 1134 }, { "epoch": 2.0902809765085215, "grad_norm": 0.15985724329948425, "learning_rate": 0.00032596685082872926, "loss": 0.9256, "num_tokens": 71945628.0, "step": 1135 }, { "epoch": 2.0921234454168585, "grad_norm": 0.15525072813034058, "learning_rate": 0.0003258133824432167, "loss": 0.9159, "num_tokens": 72008586.0, "step": 1136 }, { "epoch": 2.093965914325196, "grad_norm": 0.16595934331417084, "learning_rate": 0.0003256599140577041, "loss": 0.8048, "num_tokens": 72070295.0, "step": 1137 }, { "epoch": 2.095808383233533, "grad_norm": 0.1627570539712906, "learning_rate": 0.0003255064456721915, "loss": 0.8232, "num_tokens": 72134205.0, "step": 1138 }, { "epoch": 2.09765085214187, "grad_norm": 0.1536836326122284, "learning_rate": 0.00032535297728667894, "loss": 0.8218, "num_tokens": 72196955.0, "step": 1139 }, { "epoch": 2.099493321050207, "grad_norm": 0.15144026279449463, "learning_rate": 0.00032519950890116635, "loss": 0.8182, "num_tokens": 72261376.0, "step": 1140 }, { "epoch": 2.1013357899585445, "grad_norm": 0.15525870025157928, "learning_rate": 0.00032504604051565377, "loss": 0.8637, "num_tokens": 72325260.0, "step": 1141 }, { "epoch": 2.1031782588668815, "grad_norm": 0.17015916109085083, "learning_rate": 0.0003248925721301412, "loss": 0.889, "num_tokens": 72387621.0, "step": 1142 }, { "epoch": 2.105020727775219, "grad_norm": 0.155491903424263, "learning_rate": 0.0003247391037446286, "loss": 0.9097, "num_tokens": 72450204.0, "step": 1143 }, { "epoch": 2.106863196683556, "grad_norm": 0.14661136269569397, "learning_rate": 0.000324585635359116, "loss": 0.8713, "num_tokens": 72513812.0, "step": 1144 }, { "epoch": 2.108705665591893, "grad_norm": 0.15960104763507843, "learning_rate": 0.00032443216697360344, "loss": 0.9226, "num_tokens": 72577653.0, "step": 1145 }, { "epoch": 2.11054813450023, "grad_norm": 0.1635487675666809, "learning_rate": 0.00032427869858809086, "loss": 0.8179, "num_tokens": 72641539.0, "step": 1146 }, { "epoch": 2.1123906034085675, "grad_norm": 0.15923166275024414, "learning_rate": 0.0003241252302025783, "loss": 0.8244, "num_tokens": 72704079.0, "step": 1147 }, { "epoch": 2.1142330723169045, "grad_norm": 0.1571742594242096, "learning_rate": 0.0003239717618170657, "loss": 0.8597, "num_tokens": 72767780.0, "step": 1148 }, { "epoch": 2.116075541225242, "grad_norm": 0.16235539317131042, "learning_rate": 0.0003238182934315531, "loss": 0.884, "num_tokens": 72831254.0, "step": 1149 }, { "epoch": 2.117918010133579, "grad_norm": 0.16066992282867432, "learning_rate": 0.0003236648250460405, "loss": 0.8087, "num_tokens": 72894676.0, "step": 1150 }, { "epoch": 2.1197604790419162, "grad_norm": 0.15130330622196198, "learning_rate": 0.00032351135666052795, "loss": 0.8765, "num_tokens": 72959101.0, "step": 1151 }, { "epoch": 2.121602947950253, "grad_norm": 0.16087692975997925, "learning_rate": 0.0003233578882750153, "loss": 0.9285, "num_tokens": 73022266.0, "step": 1152 }, { "epoch": 2.1234454168585906, "grad_norm": 0.1642264872789383, "learning_rate": 0.0003232044198895028, "loss": 0.8422, "num_tokens": 73084879.0, "step": 1153 }, { "epoch": 2.1252878857669275, "grad_norm": 0.16239331662654877, "learning_rate": 0.00032305095150399015, "loss": 0.8259, "num_tokens": 73147711.0, "step": 1154 }, { "epoch": 2.127130354675265, "grad_norm": 0.16754363477230072, "learning_rate": 0.0003228974831184776, "loss": 0.852, "num_tokens": 73210314.0, "step": 1155 }, { "epoch": 2.128972823583602, "grad_norm": 0.16137517988681793, "learning_rate": 0.000322744014732965, "loss": 0.8067, "num_tokens": 73274348.0, "step": 1156 }, { "epoch": 2.1308152924919392, "grad_norm": 0.17301030457019806, "learning_rate": 0.00032259054634745245, "loss": 0.8505, "num_tokens": 73338154.0, "step": 1157 }, { "epoch": 2.132657761400276, "grad_norm": 0.16286084055900574, "learning_rate": 0.0003224370779619398, "loss": 0.8444, "num_tokens": 73402266.0, "step": 1158 }, { "epoch": 2.1345002303086136, "grad_norm": 0.15782423317432404, "learning_rate": 0.0003222836095764273, "loss": 0.8656, "num_tokens": 73465360.0, "step": 1159 }, { "epoch": 2.1363426992169505, "grad_norm": 0.1692672073841095, "learning_rate": 0.00032213014119091465, "loss": 0.7838, "num_tokens": 73527081.0, "step": 1160 }, { "epoch": 2.138185168125288, "grad_norm": 0.1649845689535141, "learning_rate": 0.0003219766728054021, "loss": 0.9213, "num_tokens": 73589266.0, "step": 1161 }, { "epoch": 2.140027637033625, "grad_norm": 0.1627446860074997, "learning_rate": 0.0003218232044198895, "loss": 0.8412, "num_tokens": 73651582.0, "step": 1162 }, { "epoch": 2.1418701059419623, "grad_norm": 0.1602930724620819, "learning_rate": 0.00032166973603437696, "loss": 0.8101, "num_tokens": 73715193.0, "step": 1163 }, { "epoch": 2.143712574850299, "grad_norm": 0.1607874482870102, "learning_rate": 0.0003215162676488643, "loss": 0.876, "num_tokens": 73778811.0, "step": 1164 }, { "epoch": 2.1455550437586366, "grad_norm": 0.1621437966823578, "learning_rate": 0.00032136279926335174, "loss": 0.8575, "num_tokens": 73842190.0, "step": 1165 }, { "epoch": 2.1473975126669735, "grad_norm": 0.16172486543655396, "learning_rate": 0.00032120933087783916, "loss": 0.8638, "num_tokens": 73905721.0, "step": 1166 }, { "epoch": 2.149239981575311, "grad_norm": 0.1532696932554245, "learning_rate": 0.0003210558624923266, "loss": 0.8828, "num_tokens": 73969869.0, "step": 1167 }, { "epoch": 2.1510824504836483, "grad_norm": 0.1597205400466919, "learning_rate": 0.000320902394106814, "loss": 0.8585, "num_tokens": 74032711.0, "step": 1168 }, { "epoch": 2.1529249193919853, "grad_norm": 0.16112646460533142, "learning_rate": 0.0003207489257213014, "loss": 0.8791, "num_tokens": 74096521.0, "step": 1169 }, { "epoch": 2.154767388300322, "grad_norm": 0.1540362685918808, "learning_rate": 0.00032059545733578883, "loss": 0.8811, "num_tokens": 74160205.0, "step": 1170 }, { "epoch": 2.1566098572086596, "grad_norm": 0.16625173389911652, "learning_rate": 0.00032044198895027625, "loss": 0.8685, "num_tokens": 74222764.0, "step": 1171 }, { "epoch": 2.158452326116997, "grad_norm": 0.16300834715366364, "learning_rate": 0.00032028852056476367, "loss": 0.9152, "num_tokens": 74284027.0, "step": 1172 }, { "epoch": 2.160294795025334, "grad_norm": 0.16565117239952087, "learning_rate": 0.0003201350521792511, "loss": 0.8644, "num_tokens": 74347263.0, "step": 1173 }, { "epoch": 2.162137263933671, "grad_norm": 0.1639375388622284, "learning_rate": 0.0003199815837937385, "loss": 0.8784, "num_tokens": 74411646.0, "step": 1174 }, { "epoch": 2.1639797328420083, "grad_norm": 0.15901881456375122, "learning_rate": 0.0003198281154082259, "loss": 0.7787, "num_tokens": 74474944.0, "step": 1175 }, { "epoch": 2.1658222017503457, "grad_norm": 0.1688585728406906, "learning_rate": 0.00031967464702271334, "loss": 0.8545, "num_tokens": 74536717.0, "step": 1176 }, { "epoch": 2.1676646706586826, "grad_norm": 0.16756203770637512, "learning_rate": 0.00031952117863720075, "loss": 0.9458, "num_tokens": 74600217.0, "step": 1177 }, { "epoch": 2.1695071395670196, "grad_norm": 0.17589649558067322, "learning_rate": 0.00031936771025168817, "loss": 0.8198, "num_tokens": 74663063.0, "step": 1178 }, { "epoch": 2.171349608475357, "grad_norm": 0.15765434503555298, "learning_rate": 0.0003192142418661756, "loss": 0.8931, "num_tokens": 74727384.0, "step": 1179 }, { "epoch": 2.1731920773836944, "grad_norm": 0.1610925793647766, "learning_rate": 0.00031906077348066295, "loss": 0.8743, "num_tokens": 74790706.0, "step": 1180 }, { "epoch": 2.1750345462920313, "grad_norm": 0.16805237531661987, "learning_rate": 0.0003189073050951504, "loss": 0.8539, "num_tokens": 74853564.0, "step": 1181 }, { "epoch": 2.1768770152003687, "grad_norm": 0.1666988581418991, "learning_rate": 0.0003187538367096378, "loss": 0.843, "num_tokens": 74916593.0, "step": 1182 }, { "epoch": 2.1787194841087056, "grad_norm": 0.1648874133825302, "learning_rate": 0.00031860036832412526, "loss": 0.8557, "num_tokens": 74980098.0, "step": 1183 }, { "epoch": 2.180561953017043, "grad_norm": 0.1715010106563568, "learning_rate": 0.0003184468999386126, "loss": 0.9159, "num_tokens": 75043666.0, "step": 1184 }, { "epoch": 2.18240442192538, "grad_norm": 0.17312660813331604, "learning_rate": 0.0003182934315531001, "loss": 0.816, "num_tokens": 75107256.0, "step": 1185 }, { "epoch": 2.1842468908337174, "grad_norm": 0.16471423208713531, "learning_rate": 0.00031813996316758746, "loss": 0.8957, "num_tokens": 75171380.0, "step": 1186 }, { "epoch": 2.1860893597420543, "grad_norm": 0.17068549990653992, "learning_rate": 0.00031798649478207493, "loss": 0.8317, "num_tokens": 75235948.0, "step": 1187 }, { "epoch": 2.1879318286503917, "grad_norm": 0.16212452948093414, "learning_rate": 0.0003178330263965623, "loss": 0.7988, "num_tokens": 75298215.0, "step": 1188 }, { "epoch": 2.1897742975587287, "grad_norm": 0.16133829951286316, "learning_rate": 0.00031767955801104977, "loss": 0.839, "num_tokens": 75361808.0, "step": 1189 }, { "epoch": 2.191616766467066, "grad_norm": 0.1571374535560608, "learning_rate": 0.00031752608962553713, "loss": 0.8299, "num_tokens": 75426553.0, "step": 1190 }, { "epoch": 2.193459235375403, "grad_norm": 0.16188815236091614, "learning_rate": 0.0003173726212400246, "loss": 0.8414, "num_tokens": 75488952.0, "step": 1191 }, { "epoch": 2.1953017042837404, "grad_norm": 0.16189850866794586, "learning_rate": 0.00031721915285451197, "loss": 0.8282, "num_tokens": 75551547.0, "step": 1192 }, { "epoch": 2.1971441731920773, "grad_norm": 0.16277408599853516, "learning_rate": 0.00031706568446899944, "loss": 0.8867, "num_tokens": 75615996.0, "step": 1193 }, { "epoch": 2.1989866421004147, "grad_norm": 0.16674628853797913, "learning_rate": 0.0003169122160834868, "loss": 0.9194, "num_tokens": 75679170.0, "step": 1194 }, { "epoch": 2.2008291110087517, "grad_norm": 0.17249374091625214, "learning_rate": 0.0003167587476979743, "loss": 0.8946, "num_tokens": 75742348.0, "step": 1195 }, { "epoch": 2.202671579917089, "grad_norm": 0.17263878881931305, "learning_rate": 0.00031660527931246164, "loss": 0.805, "num_tokens": 75805817.0, "step": 1196 }, { "epoch": 2.204514048825426, "grad_norm": 0.16652712225914001, "learning_rate": 0.00031645181092694905, "loss": 0.8251, "num_tokens": 75869619.0, "step": 1197 }, { "epoch": 2.2063565177337634, "grad_norm": 0.16170048713684082, "learning_rate": 0.00031629834254143647, "loss": 0.863, "num_tokens": 75932498.0, "step": 1198 }, { "epoch": 2.2081989866421003, "grad_norm": 0.15545658767223358, "learning_rate": 0.0003161448741559239, "loss": 0.8384, "num_tokens": 75997641.0, "step": 1199 }, { "epoch": 2.2100414555504377, "grad_norm": 0.16418619453907013, "learning_rate": 0.0003159914057704113, "loss": 0.8618, "num_tokens": 76061350.0, "step": 1200 }, { "epoch": 2.2118839244587747, "grad_norm": 0.15865960717201233, "learning_rate": 0.0003158379373848987, "loss": 0.9373, "num_tokens": 76124542.0, "step": 1201 }, { "epoch": 2.213726393367112, "grad_norm": 0.16154494881629944, "learning_rate": 0.00031568446899938614, "loss": 0.8592, "num_tokens": 76188577.0, "step": 1202 }, { "epoch": 2.215568862275449, "grad_norm": 0.15961094200611115, "learning_rate": 0.00031553100061387356, "loss": 0.9322, "num_tokens": 76252972.0, "step": 1203 }, { "epoch": 2.2174113311837864, "grad_norm": 0.17792817950248718, "learning_rate": 0.000315377532228361, "loss": 0.8701, "num_tokens": 76315285.0, "step": 1204 }, { "epoch": 2.2192538000921234, "grad_norm": 0.17403940856456757, "learning_rate": 0.0003152240638428484, "loss": 0.8435, "num_tokens": 76378412.0, "step": 1205 }, { "epoch": 2.2210962690004608, "grad_norm": 0.1636994481086731, "learning_rate": 0.0003150705954573358, "loss": 0.915, "num_tokens": 76441920.0, "step": 1206 }, { "epoch": 2.2229387379087977, "grad_norm": 0.16003087162971497, "learning_rate": 0.0003149171270718232, "loss": 0.7753, "num_tokens": 76505091.0, "step": 1207 }, { "epoch": 2.224781206817135, "grad_norm": 0.1621602475643158, "learning_rate": 0.00031476365868631065, "loss": 0.8358, "num_tokens": 76567805.0, "step": 1208 }, { "epoch": 2.226623675725472, "grad_norm": 0.1585310846567154, "learning_rate": 0.000314610190300798, "loss": 0.8681, "num_tokens": 76630120.0, "step": 1209 }, { "epoch": 2.2284661446338094, "grad_norm": 0.165370911359787, "learning_rate": 0.0003144567219152855, "loss": 0.9598, "num_tokens": 76692982.0, "step": 1210 }, { "epoch": 2.2303086135421464, "grad_norm": 0.16409288346767426, "learning_rate": 0.00031430325352977285, "loss": 0.8101, "num_tokens": 76756883.0, "step": 1211 }, { "epoch": 2.2321510824504838, "grad_norm": 0.17152152955532074, "learning_rate": 0.00031414978514426027, "loss": 0.8446, "num_tokens": 76821227.0, "step": 1212 }, { "epoch": 2.2339935513588207, "grad_norm": 0.15764600038528442, "learning_rate": 0.0003139963167587477, "loss": 0.8458, "num_tokens": 76883757.0, "step": 1213 }, { "epoch": 2.235836020267158, "grad_norm": 0.16953057050704956, "learning_rate": 0.0003138428483732351, "loss": 0.9305, "num_tokens": 76946474.0, "step": 1214 }, { "epoch": 2.237678489175495, "grad_norm": 0.15523064136505127, "learning_rate": 0.0003136893799877225, "loss": 0.8945, "num_tokens": 77011054.0, "step": 1215 }, { "epoch": 2.2395209580838324, "grad_norm": 0.1675792932510376, "learning_rate": 0.00031353591160220994, "loss": 0.8568, "num_tokens": 77074428.0, "step": 1216 }, { "epoch": 2.2413634269921694, "grad_norm": 0.15893308818340302, "learning_rate": 0.00031338244321669735, "loss": 0.884, "num_tokens": 77138683.0, "step": 1217 }, { "epoch": 2.243205895900507, "grad_norm": 0.16404743492603302, "learning_rate": 0.00031322897483118477, "loss": 0.9167, "num_tokens": 77200235.0, "step": 1218 }, { "epoch": 2.2450483648088437, "grad_norm": 0.1669047772884369, "learning_rate": 0.0003130755064456722, "loss": 0.7987, "num_tokens": 77263122.0, "step": 1219 }, { "epoch": 2.246890833717181, "grad_norm": 0.1601138710975647, "learning_rate": 0.0003129220380601596, "loss": 0.8427, "num_tokens": 77326779.0, "step": 1220 }, { "epoch": 2.248733302625518, "grad_norm": 0.16571852564811707, "learning_rate": 0.000312768569674647, "loss": 0.8171, "num_tokens": 77389432.0, "step": 1221 }, { "epoch": 2.2505757715338555, "grad_norm": 0.17164970934391022, "learning_rate": 0.00031261510128913444, "loss": 0.8399, "num_tokens": 77451390.0, "step": 1222 }, { "epoch": 2.2524182404421924, "grad_norm": 0.17509162425994873, "learning_rate": 0.00031246163290362186, "loss": 0.8599, "num_tokens": 77515077.0, "step": 1223 }, { "epoch": 2.25426070935053, "grad_norm": 0.15718570351600647, "learning_rate": 0.0003123081645181093, "loss": 0.8134, "num_tokens": 77578487.0, "step": 1224 }, { "epoch": 2.2561031782588667, "grad_norm": 0.16115356981754303, "learning_rate": 0.0003121546961325967, "loss": 0.8891, "num_tokens": 77641889.0, "step": 1225 }, { "epoch": 2.257945647167204, "grad_norm": 0.16110274195671082, "learning_rate": 0.0003120012277470841, "loss": 0.8302, "num_tokens": 77706005.0, "step": 1226 }, { "epoch": 2.259788116075541, "grad_norm": 0.15859033167362213, "learning_rate": 0.0003118477593615715, "loss": 0.8561, "num_tokens": 77770127.0, "step": 1227 }, { "epoch": 2.2616305849838785, "grad_norm": 0.17327940464019775, "learning_rate": 0.00031169429097605895, "loss": 0.8622, "num_tokens": 77833272.0, "step": 1228 }, { "epoch": 2.2634730538922154, "grad_norm": 0.1538032591342926, "learning_rate": 0.0003115408225905463, "loss": 0.845, "num_tokens": 77897063.0, "step": 1229 }, { "epoch": 2.265315522800553, "grad_norm": 0.1620187610387802, "learning_rate": 0.0003113873542050338, "loss": 0.8277, "num_tokens": 77961592.0, "step": 1230 }, { "epoch": 2.2671579917088898, "grad_norm": 0.1555824875831604, "learning_rate": 0.00031123388581952115, "loss": 0.8316, "num_tokens": 78025774.0, "step": 1231 }, { "epoch": 2.269000460617227, "grad_norm": 0.16537873446941376, "learning_rate": 0.0003110804174340086, "loss": 0.9024, "num_tokens": 78089860.0, "step": 1232 }, { "epoch": 2.270842929525564, "grad_norm": 0.15933917462825775, "learning_rate": 0.000310926949048496, "loss": 0.8434, "num_tokens": 78153005.0, "step": 1233 }, { "epoch": 2.2726853984339015, "grad_norm": 0.16467449069023132, "learning_rate": 0.00031077348066298346, "loss": 0.8748, "num_tokens": 78217182.0, "step": 1234 }, { "epoch": 2.2745278673422384, "grad_norm": 0.16931268572807312, "learning_rate": 0.0003106200122774708, "loss": 0.8216, "num_tokens": 78280064.0, "step": 1235 }, { "epoch": 2.276370336250576, "grad_norm": 0.16499516367912292, "learning_rate": 0.0003104665438919583, "loss": 0.837, "num_tokens": 78343721.0, "step": 1236 }, { "epoch": 2.278212805158913, "grad_norm": 0.17254117131233215, "learning_rate": 0.00031031307550644565, "loss": 0.8712, "num_tokens": 78407008.0, "step": 1237 }, { "epoch": 2.28005527406725, "grad_norm": 0.16283386945724487, "learning_rate": 0.0003101596071209331, "loss": 0.9496, "num_tokens": 78471026.0, "step": 1238 }, { "epoch": 2.281897742975587, "grad_norm": 0.16400615870952606, "learning_rate": 0.0003100061387354205, "loss": 0.8524, "num_tokens": 78534957.0, "step": 1239 }, { "epoch": 2.2837402118839245, "grad_norm": 0.16811801493167877, "learning_rate": 0.00030985267034990796, "loss": 0.8244, "num_tokens": 78597811.0, "step": 1240 }, { "epoch": 2.2855826807922615, "grad_norm": 0.16786707937717438, "learning_rate": 0.0003096992019643953, "loss": 0.8701, "num_tokens": 78661552.0, "step": 1241 }, { "epoch": 2.287425149700599, "grad_norm": 0.154982790350914, "learning_rate": 0.0003095457335788828, "loss": 0.8902, "num_tokens": 78726061.0, "step": 1242 }, { "epoch": 2.289267618608936, "grad_norm": 0.170968696475029, "learning_rate": 0.00030939226519337016, "loss": 0.8837, "num_tokens": 78789756.0, "step": 1243 }, { "epoch": 2.291110087517273, "grad_norm": 0.16389021277427673, "learning_rate": 0.0003092387968078576, "loss": 0.8564, "num_tokens": 78853933.0, "step": 1244 }, { "epoch": 2.29295255642561, "grad_norm": 0.15703271329402924, "learning_rate": 0.000309085328422345, "loss": 1.012, "num_tokens": 78918415.0, "step": 1245 }, { "epoch": 2.2947950253339475, "grad_norm": 0.15860621631145477, "learning_rate": 0.0003089318600368324, "loss": 0.8516, "num_tokens": 78981356.0, "step": 1246 }, { "epoch": 2.2966374942422845, "grad_norm": 0.16440868377685547, "learning_rate": 0.00030877839165131983, "loss": 0.8628, "num_tokens": 79044569.0, "step": 1247 }, { "epoch": 2.298479963150622, "grad_norm": 0.16588100790977478, "learning_rate": 0.00030862492326580725, "loss": 0.8658, "num_tokens": 79107658.0, "step": 1248 }, { "epoch": 2.300322432058959, "grad_norm": 0.16621863842010498, "learning_rate": 0.00030847145488029467, "loss": 0.8832, "num_tokens": 79171624.0, "step": 1249 }, { "epoch": 2.302164900967296, "grad_norm": 0.16766925156116486, "learning_rate": 0.0003083179864947821, "loss": 0.928, "num_tokens": 79236101.0, "step": 1250 }, { "epoch": 2.3040073698756336, "grad_norm": 0.17170105874538422, "learning_rate": 0.0003081645181092695, "loss": 0.8142, "num_tokens": 79298577.0, "step": 1251 }, { "epoch": 2.3058498387839705, "grad_norm": 0.1680782437324524, "learning_rate": 0.0003080110497237569, "loss": 0.837, "num_tokens": 79362582.0, "step": 1252 }, { "epoch": 2.3076923076923075, "grad_norm": 0.23485174775123596, "learning_rate": 0.00030785758133824434, "loss": 0.8371, "num_tokens": 79426073.0, "step": 1253 }, { "epoch": 2.309534776600645, "grad_norm": 0.19018375873565674, "learning_rate": 0.00030770411295273176, "loss": 0.9656, "num_tokens": 79488051.0, "step": 1254 }, { "epoch": 2.3113772455089823, "grad_norm": 0.16255460679531097, "learning_rate": 0.0003075506445672192, "loss": 0.9167, "num_tokens": 79551278.0, "step": 1255 }, { "epoch": 2.313219714417319, "grad_norm": 0.16779112815856934, "learning_rate": 0.0003073971761817066, "loss": 0.8318, "num_tokens": 79615262.0, "step": 1256 }, { "epoch": 2.315062183325656, "grad_norm": 0.1615806668996811, "learning_rate": 0.000307243707796194, "loss": 0.8168, "num_tokens": 79678953.0, "step": 1257 }, { "epoch": 2.3169046522339936, "grad_norm": 0.16602550446987152, "learning_rate": 0.0003070902394106814, "loss": 0.9182, "num_tokens": 79742722.0, "step": 1258 }, { "epoch": 2.318747121142331, "grad_norm": 0.15661804378032684, "learning_rate": 0.0003069367710251688, "loss": 0.798, "num_tokens": 79807755.0, "step": 1259 }, { "epoch": 2.320589590050668, "grad_norm": 0.15925875306129456, "learning_rate": 0.00030678330263965626, "loss": 0.9122, "num_tokens": 79871165.0, "step": 1260 }, { "epoch": 2.322432058959005, "grad_norm": 0.16634108126163483, "learning_rate": 0.0003066298342541436, "loss": 0.8417, "num_tokens": 79934525.0, "step": 1261 }, { "epoch": 2.3242745278673422, "grad_norm": 0.1675313264131546, "learning_rate": 0.0003064763658686311, "loss": 0.8256, "num_tokens": 79999128.0, "step": 1262 }, { "epoch": 2.3261169967756796, "grad_norm": 0.1724894493818283, "learning_rate": 0.00030632289748311846, "loss": 0.9006, "num_tokens": 80062163.0, "step": 1263 }, { "epoch": 2.3279594656840166, "grad_norm": 0.15514163672924042, "learning_rate": 0.00030616942909760593, "loss": 0.8799, "num_tokens": 80126909.0, "step": 1264 }, { "epoch": 2.3298019345923535, "grad_norm": 0.16515201330184937, "learning_rate": 0.0003060159607120933, "loss": 0.8656, "num_tokens": 80189585.0, "step": 1265 }, { "epoch": 2.331644403500691, "grad_norm": 0.22200551629066467, "learning_rate": 0.00030586249232658077, "loss": 0.8702, "num_tokens": 80252699.0, "step": 1266 }, { "epoch": 2.3334868724090283, "grad_norm": 0.16337992250919342, "learning_rate": 0.00030570902394106813, "loss": 0.8188, "num_tokens": 80316380.0, "step": 1267 }, { "epoch": 2.3353293413173652, "grad_norm": 0.16779443621635437, "learning_rate": 0.0003055555555555556, "loss": 0.934, "num_tokens": 80381414.0, "step": 1268 }, { "epoch": 2.337171810225702, "grad_norm": 0.18845051527023315, "learning_rate": 0.00030540208717004297, "loss": 0.9405, "num_tokens": 80444883.0, "step": 1269 }, { "epoch": 2.3390142791340396, "grad_norm": 0.1779564917087555, "learning_rate": 0.00030524861878453044, "loss": 0.8299, "num_tokens": 80506400.0, "step": 1270 }, { "epoch": 2.340856748042377, "grad_norm": 0.17017672955989838, "learning_rate": 0.0003050951503990178, "loss": 0.8691, "num_tokens": 80569718.0, "step": 1271 }, { "epoch": 2.342699216950714, "grad_norm": 0.16497883200645447, "learning_rate": 0.0003049416820135053, "loss": 0.8495, "num_tokens": 80632726.0, "step": 1272 }, { "epoch": 2.3445416858590513, "grad_norm": 0.17041975259780884, "learning_rate": 0.00030478821362799264, "loss": 0.7973, "num_tokens": 80695064.0, "step": 1273 }, { "epoch": 2.3463841547673883, "grad_norm": 0.1635364443063736, "learning_rate": 0.00030463474524248006, "loss": 0.8511, "num_tokens": 80757437.0, "step": 1274 }, { "epoch": 2.3482266236757257, "grad_norm": 0.16570639610290527, "learning_rate": 0.0003044812768569675, "loss": 0.9581, "num_tokens": 80821507.0, "step": 1275 }, { "epoch": 2.3500690925840626, "grad_norm": 0.17749936878681183, "learning_rate": 0.00030432780847145484, "loss": 0.8596, "num_tokens": 80885836.0, "step": 1276 }, { "epoch": 2.3519115614924, "grad_norm": 0.1776285469532013, "learning_rate": 0.0003041743400859423, "loss": 0.9249, "num_tokens": 80948909.0, "step": 1277 }, { "epoch": 2.353754030400737, "grad_norm": 0.16676318645477295, "learning_rate": 0.00030402087170042967, "loss": 0.8486, "num_tokens": 81011148.0, "step": 1278 }, { "epoch": 2.3555964993090743, "grad_norm": 0.16984796524047852, "learning_rate": 0.00030386740331491714, "loss": 0.865, "num_tokens": 81073885.0, "step": 1279 }, { "epoch": 2.3574389682174113, "grad_norm": 0.1554846167564392, "learning_rate": 0.0003037139349294045, "loss": 0.9098, "num_tokens": 81138238.0, "step": 1280 }, { "epoch": 2.3592814371257487, "grad_norm": 0.16914153099060059, "learning_rate": 0.000303560466543892, "loss": 0.7899, "num_tokens": 81202202.0, "step": 1281 }, { "epoch": 2.3611239060340856, "grad_norm": 0.1708213984966278, "learning_rate": 0.00030340699815837934, "loss": 0.8186, "num_tokens": 81264658.0, "step": 1282 }, { "epoch": 2.362966374942423, "grad_norm": 0.16595284640789032, "learning_rate": 0.0003032535297728668, "loss": 0.8598, "num_tokens": 81329245.0, "step": 1283 }, { "epoch": 2.36480884385076, "grad_norm": 0.16838669776916504, "learning_rate": 0.0003031000613873542, "loss": 0.8707, "num_tokens": 81393179.0, "step": 1284 }, { "epoch": 2.3666513127590973, "grad_norm": 0.16516734659671783, "learning_rate": 0.00030294659300184165, "loss": 0.8478, "num_tokens": 81458372.0, "step": 1285 }, { "epoch": 2.3684937816674343, "grad_norm": 0.16706456243991852, "learning_rate": 0.000302793124616329, "loss": 0.8837, "num_tokens": 81521343.0, "step": 1286 }, { "epoch": 2.3703362505757717, "grad_norm": 0.1649736762046814, "learning_rate": 0.0003026396562308165, "loss": 0.8589, "num_tokens": 81585573.0, "step": 1287 }, { "epoch": 2.3721787194841086, "grad_norm": 0.15945257246494293, "learning_rate": 0.00030248618784530385, "loss": 0.8744, "num_tokens": 81649424.0, "step": 1288 }, { "epoch": 2.374021188392446, "grad_norm": 0.1621333211660385, "learning_rate": 0.00030233271945979127, "loss": 0.8672, "num_tokens": 81714676.0, "step": 1289 }, { "epoch": 2.375863657300783, "grad_norm": 0.1744396984577179, "learning_rate": 0.0003021792510742787, "loss": 0.9389, "num_tokens": 81777700.0, "step": 1290 }, { "epoch": 2.3777061262091204, "grad_norm": 0.1622326672077179, "learning_rate": 0.0003020257826887661, "loss": 0.8314, "num_tokens": 81841368.0, "step": 1291 }, { "epoch": 2.3795485951174573, "grad_norm": 0.16686280071735382, "learning_rate": 0.0003018723143032535, "loss": 0.8281, "num_tokens": 81905744.0, "step": 1292 }, { "epoch": 2.3813910640257947, "grad_norm": 0.16718749701976776, "learning_rate": 0.00030171884591774094, "loss": 0.8845, "num_tokens": 81968228.0, "step": 1293 }, { "epoch": 2.3832335329341316, "grad_norm": 0.16298998892307281, "learning_rate": 0.00030156537753222836, "loss": 0.9552, "num_tokens": 82032430.0, "step": 1294 }, { "epoch": 2.385076001842469, "grad_norm": 0.19707953929901123, "learning_rate": 0.0003014119091467158, "loss": 0.8199, "num_tokens": 82095791.0, "step": 1295 }, { "epoch": 2.386918470750806, "grad_norm": 0.16045893728733063, "learning_rate": 0.0003012584407612032, "loss": 0.9437, "num_tokens": 82158938.0, "step": 1296 }, { "epoch": 2.3887609396591434, "grad_norm": 0.17610220611095428, "learning_rate": 0.0003011049723756906, "loss": 0.7909, "num_tokens": 82222271.0, "step": 1297 }, { "epoch": 2.3906034085674803, "grad_norm": 0.17391462624073029, "learning_rate": 0.00030095150399017803, "loss": 0.8481, "num_tokens": 82283320.0, "step": 1298 }, { "epoch": 2.3924458774758177, "grad_norm": 0.16436637938022614, "learning_rate": 0.00030079803560466544, "loss": 0.8946, "num_tokens": 82347459.0, "step": 1299 }, { "epoch": 2.3942883463841547, "grad_norm": 0.16267961263656616, "learning_rate": 0.00030064456721915286, "loss": 0.8868, "num_tokens": 82410307.0, "step": 1300 }, { "epoch": 2.396130815292492, "grad_norm": 0.1701728105545044, "learning_rate": 0.0003004910988336403, "loss": 0.8683, "num_tokens": 82472660.0, "step": 1301 }, { "epoch": 2.397973284200829, "grad_norm": 0.170204296708107, "learning_rate": 0.0003003376304481277, "loss": 0.8299, "num_tokens": 82535418.0, "step": 1302 }, { "epoch": 2.3998157531091664, "grad_norm": 0.17602920532226562, "learning_rate": 0.0003001841620626151, "loss": 0.8924, "num_tokens": 82599277.0, "step": 1303 }, { "epoch": 2.4016582220175033, "grad_norm": 0.16409729421138763, "learning_rate": 0.0003000306936771025, "loss": 0.8689, "num_tokens": 82664341.0, "step": 1304 }, { "epoch": 2.4035006909258407, "grad_norm": 0.16914337873458862, "learning_rate": 0.00029987722529158995, "loss": 0.8947, "num_tokens": 82727946.0, "step": 1305 }, { "epoch": 2.4053431598341777, "grad_norm": 0.17951591312885284, "learning_rate": 0.0002997237569060773, "loss": 0.8, "num_tokens": 82788465.0, "step": 1306 }, { "epoch": 2.407185628742515, "grad_norm": 0.1620374470949173, "learning_rate": 0.0002995702885205648, "loss": 0.881, "num_tokens": 82852470.0, "step": 1307 }, { "epoch": 2.409028097650852, "grad_norm": 0.1618890017271042, "learning_rate": 0.00029941682013505215, "loss": 0.8332, "num_tokens": 82916534.0, "step": 1308 }, { "epoch": 2.4108705665591894, "grad_norm": 0.16889844834804535, "learning_rate": 0.0002992633517495396, "loss": 0.8599, "num_tokens": 82980538.0, "step": 1309 }, { "epoch": 2.4127130354675264, "grad_norm": 0.16954143345355988, "learning_rate": 0.000299109883364027, "loss": 0.8888, "num_tokens": 83043543.0, "step": 1310 }, { "epoch": 2.4145555043758637, "grad_norm": 0.1708676815032959, "learning_rate": 0.00029895641497851446, "loss": 0.9179, "num_tokens": 83106945.0, "step": 1311 }, { "epoch": 2.4163979732842007, "grad_norm": 0.16547061502933502, "learning_rate": 0.0002988029465930018, "loss": 0.912, "num_tokens": 83170444.0, "step": 1312 }, { "epoch": 2.418240442192538, "grad_norm": 0.16122227907180786, "learning_rate": 0.0002986494782074893, "loss": 0.8392, "num_tokens": 83234828.0, "step": 1313 }, { "epoch": 2.420082911100875, "grad_norm": 0.1728641539812088, "learning_rate": 0.00029849600982197666, "loss": 0.9202, "num_tokens": 83296640.0, "step": 1314 }, { "epoch": 2.4219253800092124, "grad_norm": 0.17369601130485535, "learning_rate": 0.00029834254143646413, "loss": 0.9141, "num_tokens": 83359406.0, "step": 1315 }, { "epoch": 2.4237678489175494, "grad_norm": 0.16509297490119934, "learning_rate": 0.0002981890730509515, "loss": 0.8661, "num_tokens": 83422145.0, "step": 1316 }, { "epoch": 2.4256103178258868, "grad_norm": 0.1654469072818756, "learning_rate": 0.00029803560466543896, "loss": 0.8351, "num_tokens": 83485178.0, "step": 1317 }, { "epoch": 2.4274527867342237, "grad_norm": 0.16715212166309357, "learning_rate": 0.00029788213627992633, "loss": 0.8491, "num_tokens": 83549106.0, "step": 1318 }, { "epoch": 2.429295255642561, "grad_norm": 0.16461081802845, "learning_rate": 0.0002977286678944138, "loss": 0.878, "num_tokens": 83613815.0, "step": 1319 }, { "epoch": 2.431137724550898, "grad_norm": 0.19393743574619293, "learning_rate": 0.00029757519950890116, "loss": 0.916, "num_tokens": 83677724.0, "step": 1320 }, { "epoch": 2.4329801934592354, "grad_norm": 0.16237396001815796, "learning_rate": 0.0002974217311233886, "loss": 0.8997, "num_tokens": 83742028.0, "step": 1321 }, { "epoch": 2.4348226623675724, "grad_norm": 0.1678115576505661, "learning_rate": 0.000297268262737876, "loss": 0.8546, "num_tokens": 83804909.0, "step": 1322 }, { "epoch": 2.43666513127591, "grad_norm": 0.17171329259872437, "learning_rate": 0.0002971147943523634, "loss": 0.8168, "num_tokens": 83866218.0, "step": 1323 }, { "epoch": 2.4385076001842467, "grad_norm": 0.17284557223320007, "learning_rate": 0.00029696132596685083, "loss": 0.8675, "num_tokens": 83930014.0, "step": 1324 }, { "epoch": 2.440350069092584, "grad_norm": 0.1880342960357666, "learning_rate": 0.00029680785758133825, "loss": 0.8371, "num_tokens": 83994704.0, "step": 1325 }, { "epoch": 2.442192538000921, "grad_norm": 0.16544462740421295, "learning_rate": 0.00029665438919582567, "loss": 0.906, "num_tokens": 84058952.0, "step": 1326 }, { "epoch": 2.4440350069092585, "grad_norm": 0.17082522809505463, "learning_rate": 0.0002965009208103131, "loss": 0.8551, "num_tokens": 84122154.0, "step": 1327 }, { "epoch": 2.4458774758175954, "grad_norm": 0.16611029207706451, "learning_rate": 0.0002963474524248005, "loss": 0.8023, "num_tokens": 84184710.0, "step": 1328 }, { "epoch": 2.447719944725933, "grad_norm": 0.16573627293109894, "learning_rate": 0.0002961939840392879, "loss": 0.8343, "num_tokens": 84248553.0, "step": 1329 }, { "epoch": 2.4495624136342697, "grad_norm": 0.1627078354358673, "learning_rate": 0.00029604051565377534, "loss": 0.8497, "num_tokens": 84312074.0, "step": 1330 }, { "epoch": 2.451404882542607, "grad_norm": 0.1644984483718872, "learning_rate": 0.00029588704726826276, "loss": 0.8852, "num_tokens": 84376139.0, "step": 1331 }, { "epoch": 2.453247351450944, "grad_norm": 0.16413229703903198, "learning_rate": 0.0002957335788827502, "loss": 0.846, "num_tokens": 84440335.0, "step": 1332 }, { "epoch": 2.4550898203592815, "grad_norm": 0.1651073545217514, "learning_rate": 0.0002955801104972376, "loss": 0.9438, "num_tokens": 84504815.0, "step": 1333 }, { "epoch": 2.4569322892676184, "grad_norm": 0.1636662632226944, "learning_rate": 0.000295426642111725, "loss": 0.8017, "num_tokens": 84569247.0, "step": 1334 }, { "epoch": 2.458774758175956, "grad_norm": 0.20116524398326874, "learning_rate": 0.00029527317372621243, "loss": 0.828, "num_tokens": 84633711.0, "step": 1335 }, { "epoch": 2.4606172270842928, "grad_norm": 0.1775459349155426, "learning_rate": 0.0002951197053406998, "loss": 0.8292, "num_tokens": 84697382.0, "step": 1336 }, { "epoch": 2.46245969599263, "grad_norm": 0.16567833721637726, "learning_rate": 0.00029496623695518726, "loss": 0.9097, "num_tokens": 84761294.0, "step": 1337 }, { "epoch": 2.4643021649009675, "grad_norm": 0.16452424228191376, "learning_rate": 0.00029481276856967463, "loss": 0.8008, "num_tokens": 84824495.0, "step": 1338 }, { "epoch": 2.4661446338093045, "grad_norm": 0.1733730286359787, "learning_rate": 0.0002946593001841621, "loss": 0.8709, "num_tokens": 84886819.0, "step": 1339 }, { "epoch": 2.4679871027176414, "grad_norm": 0.17559471726417542, "learning_rate": 0.00029450583179864946, "loss": 0.9284, "num_tokens": 84950414.0, "step": 1340 }, { "epoch": 2.469829571625979, "grad_norm": 0.17280519008636475, "learning_rate": 0.00029435236341313693, "loss": 0.8348, "num_tokens": 85014474.0, "step": 1341 }, { "epoch": 2.471672040534316, "grad_norm": 0.16357670724391937, "learning_rate": 0.0002941988950276243, "loss": 0.8756, "num_tokens": 85077991.0, "step": 1342 }, { "epoch": 2.473514509442653, "grad_norm": 0.17150455713272095, "learning_rate": 0.00029404542664211177, "loss": 0.8232, "num_tokens": 85139616.0, "step": 1343 }, { "epoch": 2.47535697835099, "grad_norm": 0.1786317080259323, "learning_rate": 0.00029389195825659913, "loss": 0.8225, "num_tokens": 85202708.0, "step": 1344 }, { "epoch": 2.4771994472593275, "grad_norm": 0.16733038425445557, "learning_rate": 0.0002937384898710866, "loss": 0.8977, "num_tokens": 85266933.0, "step": 1345 }, { "epoch": 2.479041916167665, "grad_norm": 0.15840771794319153, "learning_rate": 0.00029358502148557397, "loss": 0.8917, "num_tokens": 85331603.0, "step": 1346 }, { "epoch": 2.480884385076002, "grad_norm": 0.16771864891052246, "learning_rate": 0.0002934315531000614, "loss": 0.8823, "num_tokens": 85395140.0, "step": 1347 }, { "epoch": 2.482726853984339, "grad_norm": 0.18234655261039734, "learning_rate": 0.0002932780847145488, "loss": 0.8192, "num_tokens": 85457620.0, "step": 1348 }, { "epoch": 2.484569322892676, "grad_norm": 0.16921643912792206, "learning_rate": 0.0002931246163290362, "loss": 0.8847, "num_tokens": 85520533.0, "step": 1349 }, { "epoch": 2.4864117918010136, "grad_norm": 0.176498144865036, "learning_rate": 0.00029297114794352364, "loss": 0.9153, "num_tokens": 85583940.0, "step": 1350 }, { "epoch": 2.4882542607093505, "grad_norm": 0.1642284244298935, "learning_rate": 0.000292817679558011, "loss": 0.9347, "num_tokens": 85647669.0, "step": 1351 }, { "epoch": 2.4900967296176875, "grad_norm": 0.17522276937961578, "learning_rate": 0.0002926642111724985, "loss": 0.8318, "num_tokens": 85708099.0, "step": 1352 }, { "epoch": 2.491939198526025, "grad_norm": 0.16750015318393707, "learning_rate": 0.00029251074278698584, "loss": 0.871, "num_tokens": 85771967.0, "step": 1353 }, { "epoch": 2.4937816674343622, "grad_norm": 0.20807668566703796, "learning_rate": 0.0002923572744014733, "loss": 0.9694, "num_tokens": 85836292.0, "step": 1354 }, { "epoch": 2.495624136342699, "grad_norm": 0.19167818129062653, "learning_rate": 0.0002922038060159607, "loss": 0.8935, "num_tokens": 85899597.0, "step": 1355 }, { "epoch": 2.497466605251036, "grad_norm": 0.16750583052635193, "learning_rate": 0.00029205033763044815, "loss": 0.9872, "num_tokens": 85963641.0, "step": 1356 }, { "epoch": 2.4993090741593735, "grad_norm": 0.17718112468719482, "learning_rate": 0.0002918968692449355, "loss": 0.8723, "num_tokens": 86026010.0, "step": 1357 }, { "epoch": 2.501151543067711, "grad_norm": 0.1705387383699417, "learning_rate": 0.000291743400859423, "loss": 0.8817, "num_tokens": 86090940.0, "step": 1358 }, { "epoch": 2.502994011976048, "grad_norm": 0.17788055539131165, "learning_rate": 0.00029158993247391035, "loss": 0.8093, "num_tokens": 86154428.0, "step": 1359 }, { "epoch": 2.504836480884385, "grad_norm": 0.17629402875900269, "learning_rate": 0.0002914364640883978, "loss": 0.8429, "num_tokens": 86216153.0, "step": 1360 }, { "epoch": 2.506678949792722, "grad_norm": 0.15908007323741913, "learning_rate": 0.0002912829957028852, "loss": 0.9316, "num_tokens": 86278617.0, "step": 1361 }, { "epoch": 2.5085214187010596, "grad_norm": 0.16571736335754395, "learning_rate": 0.00029112952731737265, "loss": 0.9064, "num_tokens": 86341900.0, "step": 1362 }, { "epoch": 2.5103638876093965, "grad_norm": 0.15960372984409332, "learning_rate": 0.00029097605893186, "loss": 0.8754, "num_tokens": 86404911.0, "step": 1363 }, { "epoch": 2.5122063565177335, "grad_norm": 0.16258177161216736, "learning_rate": 0.0002908225905463475, "loss": 0.892, "num_tokens": 86469222.0, "step": 1364 }, { "epoch": 2.514048825426071, "grad_norm": 0.16886773705482483, "learning_rate": 0.00029066912216083485, "loss": 0.812, "num_tokens": 86533211.0, "step": 1365 }, { "epoch": 2.5158912943344083, "grad_norm": 0.169014573097229, "learning_rate": 0.0002905156537753223, "loss": 0.9176, "num_tokens": 86597506.0, "step": 1366 }, { "epoch": 2.5177337632427452, "grad_norm": 0.18225499987602234, "learning_rate": 0.0002903621853898097, "loss": 0.8973, "num_tokens": 86660207.0, "step": 1367 }, { "epoch": 2.519576232151082, "grad_norm": 0.18338841199874878, "learning_rate": 0.0002902087170042971, "loss": 0.8273, "num_tokens": 86723671.0, "step": 1368 }, { "epoch": 2.5214187010594196, "grad_norm": 0.1726466715335846, "learning_rate": 0.0002900552486187845, "loss": 0.9432, "num_tokens": 86786369.0, "step": 1369 }, { "epoch": 2.523261169967757, "grad_norm": 0.17203280329704285, "learning_rate": 0.00028990178023327194, "loss": 0.8931, "num_tokens": 86850480.0, "step": 1370 }, { "epoch": 2.525103638876094, "grad_norm": 0.1630592942237854, "learning_rate": 0.00028974831184775936, "loss": 0.9083, "num_tokens": 86913822.0, "step": 1371 }, { "epoch": 2.5269461077844313, "grad_norm": 0.17440439760684967, "learning_rate": 0.0002895948434622468, "loss": 0.8125, "num_tokens": 86978177.0, "step": 1372 }, { "epoch": 2.5287885766927682, "grad_norm": 0.17346927523612976, "learning_rate": 0.0002894413750767342, "loss": 0.8953, "num_tokens": 87041316.0, "step": 1373 }, { "epoch": 2.5306310456011056, "grad_norm": 0.16948837041854858, "learning_rate": 0.0002892879066912216, "loss": 0.8233, "num_tokens": 87104395.0, "step": 1374 }, { "epoch": 2.5324735145094426, "grad_norm": 0.16330085694789886, "learning_rate": 0.00028913443830570903, "loss": 0.9199, "num_tokens": 87168753.0, "step": 1375 }, { "epoch": 2.53431598341778, "grad_norm": 0.17549988627433777, "learning_rate": 0.00028898096992019645, "loss": 0.7977, "num_tokens": 87233006.0, "step": 1376 }, { "epoch": 2.536158452326117, "grad_norm": 0.16299568116664886, "learning_rate": 0.00028882750153468386, "loss": 0.8906, "num_tokens": 87296540.0, "step": 1377 }, { "epoch": 2.5380009212344543, "grad_norm": 0.16738207638263702, "learning_rate": 0.0002886740331491713, "loss": 0.8776, "num_tokens": 87359710.0, "step": 1378 }, { "epoch": 2.5398433901427913, "grad_norm": 0.17282524704933167, "learning_rate": 0.0002885205647636587, "loss": 0.8808, "num_tokens": 87420735.0, "step": 1379 }, { "epoch": 2.5416858590511286, "grad_norm": 0.17037560045719147, "learning_rate": 0.0002883670963781461, "loss": 0.8099, "num_tokens": 87484893.0, "step": 1380 }, { "epoch": 2.5435283279594656, "grad_norm": 0.1665983498096466, "learning_rate": 0.00028821362799263354, "loss": 0.8202, "num_tokens": 87547929.0, "step": 1381 }, { "epoch": 2.545370796867803, "grad_norm": 0.1596672385931015, "learning_rate": 0.00028806015960712095, "loss": 0.8909, "num_tokens": 87610697.0, "step": 1382 }, { "epoch": 2.54721326577614, "grad_norm": 0.1648922562599182, "learning_rate": 0.0002879066912216083, "loss": 0.878, "num_tokens": 87674903.0, "step": 1383 }, { "epoch": 2.5490557346844773, "grad_norm": 0.163678839802742, "learning_rate": 0.0002877532228360958, "loss": 0.8263, "num_tokens": 87739216.0, "step": 1384 }, { "epoch": 2.5508982035928143, "grad_norm": 0.16728372871875763, "learning_rate": 0.00028759975445058315, "loss": 0.8745, "num_tokens": 87801884.0, "step": 1385 }, { "epoch": 2.5527406725011517, "grad_norm": 0.17248749732971191, "learning_rate": 0.0002874462860650706, "loss": 0.8616, "num_tokens": 87866186.0, "step": 1386 }, { "epoch": 2.5545831414094886, "grad_norm": 0.16773444414138794, "learning_rate": 0.000287292817679558, "loss": 0.8493, "num_tokens": 87929157.0, "step": 1387 }, { "epoch": 2.556425610317826, "grad_norm": 0.16283169388771057, "learning_rate": 0.00028713934929404546, "loss": 0.8755, "num_tokens": 87994137.0, "step": 1388 }, { "epoch": 2.558268079226163, "grad_norm": 0.16781534254550934, "learning_rate": 0.0002869858809085328, "loss": 0.8526, "num_tokens": 88057509.0, "step": 1389 }, { "epoch": 2.5601105481345003, "grad_norm": 0.16738800704479218, "learning_rate": 0.0002868324125230203, "loss": 0.866, "num_tokens": 88121736.0, "step": 1390 }, { "epoch": 2.5619530170428373, "grad_norm": 0.17069005966186523, "learning_rate": 0.00028667894413750766, "loss": 0.8826, "num_tokens": 88186134.0, "step": 1391 }, { "epoch": 2.5637954859511747, "grad_norm": 0.16415159404277802, "learning_rate": 0.00028652547575199513, "loss": 0.8701, "num_tokens": 88250032.0, "step": 1392 }, { "epoch": 2.5656379548595116, "grad_norm": 0.1729252189397812, "learning_rate": 0.0002863720073664825, "loss": 0.8096, "num_tokens": 88313812.0, "step": 1393 }, { "epoch": 2.567480423767849, "grad_norm": 0.18014687299728394, "learning_rate": 0.00028621853898096997, "loss": 0.8603, "num_tokens": 88376651.0, "step": 1394 }, { "epoch": 2.569322892676186, "grad_norm": 0.17127405107021332, "learning_rate": 0.00028606507059545733, "loss": 0.8529, "num_tokens": 88440679.0, "step": 1395 }, { "epoch": 2.5711653615845234, "grad_norm": 0.17417334020137787, "learning_rate": 0.0002859116022099448, "loss": 0.8983, "num_tokens": 88503650.0, "step": 1396 }, { "epoch": 2.5730078304928603, "grad_norm": 0.16880720853805542, "learning_rate": 0.00028575813382443216, "loss": 0.8556, "num_tokens": 88567106.0, "step": 1397 }, { "epoch": 2.5748502994011977, "grad_norm": 0.1736394315958023, "learning_rate": 0.0002856046654389196, "loss": 0.9185, "num_tokens": 88631323.0, "step": 1398 }, { "epoch": 2.5766927683095346, "grad_norm": 0.17235898971557617, "learning_rate": 0.000285451197053407, "loss": 0.8881, "num_tokens": 88694636.0, "step": 1399 }, { "epoch": 2.578535237217872, "grad_norm": 0.17868350446224213, "learning_rate": 0.0002852977286678944, "loss": 0.8881, "num_tokens": 88758035.0, "step": 1400 }, { "epoch": 2.580377706126209, "grad_norm": 0.16463591158390045, "learning_rate": 0.00028514426028238184, "loss": 0.8251, "num_tokens": 88821844.0, "step": 1401 }, { "epoch": 2.5822201750345464, "grad_norm": 0.16336798667907715, "learning_rate": 0.00028499079189686925, "loss": 0.8896, "num_tokens": 88884290.0, "step": 1402 }, { "epoch": 2.5840626439428833, "grad_norm": 0.1737934798002243, "learning_rate": 0.00028483732351135667, "loss": 0.8129, "num_tokens": 88947492.0, "step": 1403 }, { "epoch": 2.5859051128512207, "grad_norm": 0.17110894620418549, "learning_rate": 0.0002846838551258441, "loss": 0.885, "num_tokens": 89009687.0, "step": 1404 }, { "epoch": 2.5877475817595577, "grad_norm": 0.18348485231399536, "learning_rate": 0.0002845303867403315, "loss": 0.8204, "num_tokens": 89073268.0, "step": 1405 }, { "epoch": 2.589590050667895, "grad_norm": 0.1770930141210556, "learning_rate": 0.0002843769183548189, "loss": 0.8436, "num_tokens": 89136812.0, "step": 1406 }, { "epoch": 2.591432519576232, "grad_norm": 0.1797017902135849, "learning_rate": 0.00028422344996930634, "loss": 0.9044, "num_tokens": 89200049.0, "step": 1407 }, { "epoch": 2.5932749884845694, "grad_norm": 0.17901688814163208, "learning_rate": 0.00028406998158379376, "loss": 0.9367, "num_tokens": 89262391.0, "step": 1408 }, { "epoch": 2.5951174573929063, "grad_norm": 0.17462150752544403, "learning_rate": 0.0002839165131982812, "loss": 0.8114, "num_tokens": 89324998.0, "step": 1409 }, { "epoch": 2.5969599263012437, "grad_norm": 0.1762257218360901, "learning_rate": 0.0002837630448127686, "loss": 0.8794, "num_tokens": 89389871.0, "step": 1410 }, { "epoch": 2.5988023952095807, "grad_norm": 0.16544538736343384, "learning_rate": 0.000283609576427256, "loss": 0.9127, "num_tokens": 89453313.0, "step": 1411 }, { "epoch": 2.600644864117918, "grad_norm": 0.16548889875411987, "learning_rate": 0.00028345610804174343, "loss": 0.9047, "num_tokens": 89517457.0, "step": 1412 }, { "epoch": 2.6024873330262555, "grad_norm": 0.16570420563220978, "learning_rate": 0.0002833026396562308, "loss": 0.8507, "num_tokens": 89582338.0, "step": 1413 }, { "epoch": 2.6043298019345924, "grad_norm": 0.16246682405471802, "learning_rate": 0.00028314917127071827, "loss": 0.812, "num_tokens": 89646304.0, "step": 1414 }, { "epoch": 2.6061722708429294, "grad_norm": 0.16976752877235413, "learning_rate": 0.00028299570288520563, "loss": 0.8593, "num_tokens": 89709992.0, "step": 1415 }, { "epoch": 2.6080147397512667, "grad_norm": 0.16199304163455963, "learning_rate": 0.0002828422344996931, "loss": 0.8673, "num_tokens": 89774147.0, "step": 1416 }, { "epoch": 2.609857208659604, "grad_norm": 0.1684836894273758, "learning_rate": 0.00028268876611418046, "loss": 0.9022, "num_tokens": 89837819.0, "step": 1417 }, { "epoch": 2.611699677567941, "grad_norm": 0.16456417739391327, "learning_rate": 0.0002825352977286679, "loss": 0.9374, "num_tokens": 89902549.0, "step": 1418 }, { "epoch": 2.613542146476278, "grad_norm": 0.16108086705207825, "learning_rate": 0.0002823818293431553, "loss": 0.9246, "num_tokens": 89967681.0, "step": 1419 }, { "epoch": 2.6153846153846154, "grad_norm": 0.17007511854171753, "learning_rate": 0.0002822283609576427, "loss": 0.8459, "num_tokens": 90032421.0, "step": 1420 }, { "epoch": 2.617227084292953, "grad_norm": 0.17453470826148987, "learning_rate": 0.00028207489257213014, "loss": 0.8613, "num_tokens": 90095157.0, "step": 1421 }, { "epoch": 2.6190695532012898, "grad_norm": 0.16557779908180237, "learning_rate": 0.00028192142418661755, "loss": 0.8319, "num_tokens": 90157810.0, "step": 1422 }, { "epoch": 2.6209120221096267, "grad_norm": 0.17129075527191162, "learning_rate": 0.00028176795580110497, "loss": 0.8678, "num_tokens": 90220573.0, "step": 1423 }, { "epoch": 2.622754491017964, "grad_norm": 0.17825546860694885, "learning_rate": 0.0002816144874155924, "loss": 0.8043, "num_tokens": 90284168.0, "step": 1424 }, { "epoch": 2.6245969599263015, "grad_norm": 0.16648611426353455, "learning_rate": 0.0002814610190300798, "loss": 0.8462, "num_tokens": 90348512.0, "step": 1425 }, { "epoch": 2.6264394288346384, "grad_norm": 0.17674121260643005, "learning_rate": 0.0002813075506445672, "loss": 0.8838, "num_tokens": 90412673.0, "step": 1426 }, { "epoch": 2.6282818977429754, "grad_norm": 0.16577276587486267, "learning_rate": 0.00028115408225905464, "loss": 0.8661, "num_tokens": 90476952.0, "step": 1427 }, { "epoch": 2.6301243666513128, "grad_norm": 0.16184106469154358, "learning_rate": 0.000281000613873542, "loss": 0.8885, "num_tokens": 90539773.0, "step": 1428 }, { "epoch": 2.63196683555965, "grad_norm": 0.17103499174118042, "learning_rate": 0.0002808471454880295, "loss": 0.8486, "num_tokens": 90602680.0, "step": 1429 }, { "epoch": 2.633809304467987, "grad_norm": 0.17390237748622894, "learning_rate": 0.00028069367710251684, "loss": 0.8736, "num_tokens": 90664723.0, "step": 1430 }, { "epoch": 2.635651773376324, "grad_norm": 0.1700381338596344, "learning_rate": 0.0002805402087170043, "loss": 0.8806, "num_tokens": 90727022.0, "step": 1431 }, { "epoch": 2.6374942422846614, "grad_norm": 0.2043146938085556, "learning_rate": 0.0002803867403314917, "loss": 0.7971, "num_tokens": 90790638.0, "step": 1432 }, { "epoch": 2.639336711192999, "grad_norm": 0.17201454937458038, "learning_rate": 0.00028023327194597915, "loss": 0.8275, "num_tokens": 90853397.0, "step": 1433 }, { "epoch": 2.641179180101336, "grad_norm": 0.18430374562740326, "learning_rate": 0.0002800798035604665, "loss": 0.8593, "num_tokens": 90914841.0, "step": 1434 }, { "epoch": 2.6430216490096727, "grad_norm": 0.17351946234703064, "learning_rate": 0.000279926335174954, "loss": 0.9147, "num_tokens": 90977321.0, "step": 1435 }, { "epoch": 2.64486411791801, "grad_norm": 0.1800200194120407, "learning_rate": 0.00027977286678944135, "loss": 0.903, "num_tokens": 91039964.0, "step": 1436 }, { "epoch": 2.6467065868263475, "grad_norm": 0.17532412707805634, "learning_rate": 0.0002796193984039288, "loss": 0.9174, "num_tokens": 91103343.0, "step": 1437 }, { "epoch": 2.6485490557346845, "grad_norm": 0.17400258779525757, "learning_rate": 0.0002794659300184162, "loss": 0.8723, "num_tokens": 91167517.0, "step": 1438 }, { "epoch": 2.6503915246430214, "grad_norm": 0.17717871069908142, "learning_rate": 0.00027931246163290365, "loss": 0.8103, "num_tokens": 91230357.0, "step": 1439 }, { "epoch": 2.652233993551359, "grad_norm": 0.1628819853067398, "learning_rate": 0.000279158993247391, "loss": 0.8862, "num_tokens": 91294616.0, "step": 1440 }, { "epoch": 2.654076462459696, "grad_norm": 0.17795978486537933, "learning_rate": 0.0002790055248618785, "loss": 0.8741, "num_tokens": 91357325.0, "step": 1441 }, { "epoch": 2.655918931368033, "grad_norm": 0.17066077888011932, "learning_rate": 0.00027885205647636585, "loss": 0.8418, "num_tokens": 91420404.0, "step": 1442 }, { "epoch": 2.65776140027637, "grad_norm": 0.17837610840797424, "learning_rate": 0.0002786985880908533, "loss": 0.8784, "num_tokens": 91484832.0, "step": 1443 }, { "epoch": 2.6596038691847075, "grad_norm": 0.1599118411540985, "learning_rate": 0.0002785451197053407, "loss": 0.9028, "num_tokens": 91548439.0, "step": 1444 }, { "epoch": 2.661446338093045, "grad_norm": 0.17983463406562805, "learning_rate": 0.0002783916513198281, "loss": 0.8155, "num_tokens": 91612171.0, "step": 1445 }, { "epoch": 2.663288807001382, "grad_norm": 0.17405541241168976, "learning_rate": 0.0002782381829343155, "loss": 0.8569, "num_tokens": 91675898.0, "step": 1446 }, { "epoch": 2.6651312759097188, "grad_norm": 0.18229930102825165, "learning_rate": 0.00027808471454880294, "loss": 0.8229, "num_tokens": 91737148.0, "step": 1447 }, { "epoch": 2.666973744818056, "grad_norm": 0.18017062544822693, "learning_rate": 0.00027793124616329036, "loss": 0.8315, "num_tokens": 91800141.0, "step": 1448 }, { "epoch": 2.6688162137263935, "grad_norm": 0.17009304463863373, "learning_rate": 0.0002777777777777778, "loss": 0.8104, "num_tokens": 91863717.0, "step": 1449 }, { "epoch": 2.6706586826347305, "grad_norm": 0.16799777746200562, "learning_rate": 0.0002776243093922652, "loss": 0.8881, "num_tokens": 91927715.0, "step": 1450 }, { "epoch": 2.6725011515430674, "grad_norm": 0.16979722678661346, "learning_rate": 0.0002774708410067526, "loss": 0.9023, "num_tokens": 91991067.0, "step": 1451 }, { "epoch": 2.674343620451405, "grad_norm": 0.16581138968467712, "learning_rate": 0.00027731737262124003, "loss": 0.8821, "num_tokens": 92055240.0, "step": 1452 }, { "epoch": 2.6761860893597422, "grad_norm": 0.15568412840366364, "learning_rate": 0.00027716390423572745, "loss": 0.9205, "num_tokens": 92119680.0, "step": 1453 }, { "epoch": 2.678028558268079, "grad_norm": 0.17426833510398865, "learning_rate": 0.00027701043585021487, "loss": 0.9043, "num_tokens": 92181938.0, "step": 1454 }, { "epoch": 2.679871027176416, "grad_norm": 0.17322103679180145, "learning_rate": 0.0002768569674647023, "loss": 0.8194, "num_tokens": 92245356.0, "step": 1455 }, { "epoch": 2.6817134960847535, "grad_norm": 0.16862620413303375, "learning_rate": 0.0002767034990791897, "loss": 0.839, "num_tokens": 92308618.0, "step": 1456 }, { "epoch": 2.683555964993091, "grad_norm": 0.1677866280078888, "learning_rate": 0.0002765500306936771, "loss": 0.9026, "num_tokens": 92372553.0, "step": 1457 }, { "epoch": 2.685398433901428, "grad_norm": 0.16603782773017883, "learning_rate": 0.00027639656230816454, "loss": 0.8008, "num_tokens": 92436557.0, "step": 1458 }, { "epoch": 2.6872409028097652, "grad_norm": 0.17233148217201233, "learning_rate": 0.00027624309392265195, "loss": 0.8293, "num_tokens": 92499871.0, "step": 1459 }, { "epoch": 2.689083371718102, "grad_norm": 0.16640885174274445, "learning_rate": 0.0002760896255371393, "loss": 0.8571, "num_tokens": 92564627.0, "step": 1460 }, { "epoch": 2.6909258406264396, "grad_norm": 0.17620030045509338, "learning_rate": 0.0002759361571516268, "loss": 0.8899, "num_tokens": 92628727.0, "step": 1461 }, { "epoch": 2.6927683095347765, "grad_norm": 0.16913621127605438, "learning_rate": 0.00027578268876611415, "loss": 0.8976, "num_tokens": 92692855.0, "step": 1462 }, { "epoch": 2.694610778443114, "grad_norm": 0.17674505710601807, "learning_rate": 0.0002756292203806016, "loss": 0.8926, "num_tokens": 92756143.0, "step": 1463 }, { "epoch": 2.696453247351451, "grad_norm": 0.1710929125547409, "learning_rate": 0.000275475751995089, "loss": 0.8104, "num_tokens": 92820479.0, "step": 1464 }, { "epoch": 2.6982957162597883, "grad_norm": 0.16674526035785675, "learning_rate": 0.00027532228360957646, "loss": 0.808, "num_tokens": 92883857.0, "step": 1465 }, { "epoch": 2.700138185168125, "grad_norm": 0.17011113464832306, "learning_rate": 0.0002751688152240638, "loss": 0.8877, "num_tokens": 92948657.0, "step": 1466 }, { "epoch": 2.7019806540764626, "grad_norm": 0.17168454825878143, "learning_rate": 0.0002750153468385513, "loss": 0.8309, "num_tokens": 93012641.0, "step": 1467 }, { "epoch": 2.7038231229847995, "grad_norm": 0.17245925962924957, "learning_rate": 0.00027486187845303866, "loss": 0.894, "num_tokens": 93076521.0, "step": 1468 }, { "epoch": 2.705665591893137, "grad_norm": 0.17284533381462097, "learning_rate": 0.00027470841006752613, "loss": 0.8871, "num_tokens": 93140322.0, "step": 1469 }, { "epoch": 2.707508060801474, "grad_norm": 0.16464103758335114, "learning_rate": 0.0002745549416820135, "loss": 0.9505, "num_tokens": 93204897.0, "step": 1470 }, { "epoch": 2.7093505297098113, "grad_norm": 0.1646110862493515, "learning_rate": 0.00027440147329650097, "loss": 0.8727, "num_tokens": 93269140.0, "step": 1471 }, { "epoch": 2.711192998618148, "grad_norm": 0.16011179983615875, "learning_rate": 0.00027424800491098833, "loss": 0.9214, "num_tokens": 93333854.0, "step": 1472 }, { "epoch": 2.7130354675264856, "grad_norm": 0.1686745285987854, "learning_rate": 0.0002740945365254758, "loss": 0.9239, "num_tokens": 93398001.0, "step": 1473 }, { "epoch": 2.7148779364348226, "grad_norm": 0.1695268750190735, "learning_rate": 0.00027394106813996317, "loss": 0.9669, "num_tokens": 93462191.0, "step": 1474 }, { "epoch": 2.71672040534316, "grad_norm": 0.17302606999874115, "learning_rate": 0.00027378759975445064, "loss": 0.8389, "num_tokens": 93525146.0, "step": 1475 }, { "epoch": 2.718562874251497, "grad_norm": 0.17418159544467926, "learning_rate": 0.000273634131368938, "loss": 0.8293, "num_tokens": 93588222.0, "step": 1476 }, { "epoch": 2.7204053431598343, "grad_norm": 0.16642943024635315, "learning_rate": 0.0002734806629834254, "loss": 0.8911, "num_tokens": 93651934.0, "step": 1477 }, { "epoch": 2.7222478120681712, "grad_norm": 0.18452619016170502, "learning_rate": 0.00027332719459791284, "loss": 0.9006, "num_tokens": 93714907.0, "step": 1478 }, { "epoch": 2.7240902809765086, "grad_norm": 0.17113402485847473, "learning_rate": 0.00027317372621240025, "loss": 0.9254, "num_tokens": 93777686.0, "step": 1479 }, { "epoch": 2.7259327498848456, "grad_norm": 0.17899402976036072, "learning_rate": 0.00027302025782688767, "loss": 0.882, "num_tokens": 93839766.0, "step": 1480 }, { "epoch": 2.727775218793183, "grad_norm": 0.17085938155651093, "learning_rate": 0.0002728667894413751, "loss": 0.8215, "num_tokens": 93902227.0, "step": 1481 }, { "epoch": 2.72961768770152, "grad_norm": 0.17141443490982056, "learning_rate": 0.0002727133210558625, "loss": 0.9171, "num_tokens": 93965122.0, "step": 1482 }, { "epoch": 2.7314601566098573, "grad_norm": 0.1670040488243103, "learning_rate": 0.0002725598526703499, "loss": 0.8188, "num_tokens": 94027803.0, "step": 1483 }, { "epoch": 2.7333026255181943, "grad_norm": 0.16865143179893494, "learning_rate": 0.00027240638428483734, "loss": 0.9602, "num_tokens": 94091842.0, "step": 1484 }, { "epoch": 2.7351450944265316, "grad_norm": 0.1795819252729416, "learning_rate": 0.00027225291589932476, "loss": 0.8377, "num_tokens": 94156194.0, "step": 1485 }, { "epoch": 2.7369875633348686, "grad_norm": 0.1692114919424057, "learning_rate": 0.0002720994475138122, "loss": 0.8141, "num_tokens": 94220561.0, "step": 1486 }, { "epoch": 2.738830032243206, "grad_norm": 0.1719636470079422, "learning_rate": 0.0002719459791282996, "loss": 0.825, "num_tokens": 94285310.0, "step": 1487 }, { "epoch": 2.740672501151543, "grad_norm": 0.18166568875312805, "learning_rate": 0.000271792510742787, "loss": 0.8429, "num_tokens": 94348451.0, "step": 1488 }, { "epoch": 2.7425149700598803, "grad_norm": 0.16209708154201508, "learning_rate": 0.0002716390423572744, "loss": 0.9019, "num_tokens": 94412761.0, "step": 1489 }, { "epoch": 2.7443574389682173, "grad_norm": 0.16748186945915222, "learning_rate": 0.00027148557397176185, "loss": 0.9128, "num_tokens": 94475799.0, "step": 1490 }, { "epoch": 2.7461999078765547, "grad_norm": 0.17306986451148987, "learning_rate": 0.0002713321055862492, "loss": 0.9381, "num_tokens": 94539028.0, "step": 1491 }, { "epoch": 2.7480423767848916, "grad_norm": 0.17275285720825195, "learning_rate": 0.00027117863720073663, "loss": 0.8921, "num_tokens": 94603514.0, "step": 1492 }, { "epoch": 2.749884845693229, "grad_norm": 0.16435670852661133, "learning_rate": 0.00027102516881522405, "loss": 0.8567, "num_tokens": 94666705.0, "step": 1493 }, { "epoch": 2.751727314601566, "grad_norm": 0.16591998934745789, "learning_rate": 0.00027087170042971147, "loss": 0.7735, "num_tokens": 94730670.0, "step": 1494 }, { "epoch": 2.7535697835099033, "grad_norm": 0.16764159500598907, "learning_rate": 0.0002707182320441989, "loss": 0.8075, "num_tokens": 94795097.0, "step": 1495 }, { "epoch": 2.7554122524182403, "grad_norm": 0.1788029670715332, "learning_rate": 0.0002705647636586863, "loss": 0.8888, "num_tokens": 94859957.0, "step": 1496 }, { "epoch": 2.7572547213265777, "grad_norm": 0.17275986075401306, "learning_rate": 0.0002704112952731737, "loss": 0.9537, "num_tokens": 94923528.0, "step": 1497 }, { "epoch": 2.7590971902349146, "grad_norm": 0.18194875121116638, "learning_rate": 0.00027025782688766114, "loss": 0.9114, "num_tokens": 94986698.0, "step": 1498 }, { "epoch": 2.760939659143252, "grad_norm": 0.17654302716255188, "learning_rate": 0.00027010435850214856, "loss": 0.8439, "num_tokens": 95049049.0, "step": 1499 }, { "epoch": 2.7627821280515894, "grad_norm": 0.17114849388599396, "learning_rate": 0.00026995089011663597, "loss": 0.8688, "num_tokens": 95113025.0, "step": 1500 }, { "epoch": 2.7646245969599264, "grad_norm": 0.17562590539455414, "learning_rate": 0.0002697974217311234, "loss": 0.8856, "num_tokens": 95176593.0, "step": 1501 }, { "epoch": 2.7664670658682633, "grad_norm": 0.17481081187725067, "learning_rate": 0.0002696439533456108, "loss": 0.8807, "num_tokens": 95240656.0, "step": 1502 }, { "epoch": 2.7683095347766007, "grad_norm": 0.17363744974136353, "learning_rate": 0.0002694904849600982, "loss": 0.8181, "num_tokens": 95304542.0, "step": 1503 }, { "epoch": 2.770152003684938, "grad_norm": 0.17284221947193146, "learning_rate": 0.00026933701657458564, "loss": 0.8665, "num_tokens": 95367438.0, "step": 1504 }, { "epoch": 2.771994472593275, "grad_norm": 0.17466473579406738, "learning_rate": 0.00026918354818907306, "loss": 0.8685, "num_tokens": 95431283.0, "step": 1505 }, { "epoch": 2.773836941501612, "grad_norm": 0.1753520518541336, "learning_rate": 0.0002690300798035605, "loss": 0.8262, "num_tokens": 95494186.0, "step": 1506 }, { "epoch": 2.7756794104099494, "grad_norm": 0.160833939909935, "learning_rate": 0.00026887661141804784, "loss": 0.8582, "num_tokens": 95558141.0, "step": 1507 }, { "epoch": 2.7775218793182868, "grad_norm": 0.1743139922618866, "learning_rate": 0.0002687231430325353, "loss": 0.9496, "num_tokens": 95619788.0, "step": 1508 }, { "epoch": 2.7793643482266237, "grad_norm": 0.18049389123916626, "learning_rate": 0.0002685696746470227, "loss": 0.8989, "num_tokens": 95681918.0, "step": 1509 }, { "epoch": 2.7812068171349607, "grad_norm": 0.17746290564537048, "learning_rate": 0.00026841620626151015, "loss": 0.8846, "num_tokens": 95745941.0, "step": 1510 }, { "epoch": 2.783049286043298, "grad_norm": 0.17132055759429932, "learning_rate": 0.0002682627378759975, "loss": 0.8581, "num_tokens": 95810366.0, "step": 1511 }, { "epoch": 2.7848917549516354, "grad_norm": 0.17952117323875427, "learning_rate": 0.000268109269490485, "loss": 0.8861, "num_tokens": 95873330.0, "step": 1512 }, { "epoch": 2.7867342238599724, "grad_norm": 0.17398114502429962, "learning_rate": 0.00026795580110497235, "loss": 0.8959, "num_tokens": 95936247.0, "step": 1513 }, { "epoch": 2.7885766927683093, "grad_norm": 0.16827510297298431, "learning_rate": 0.0002678023327194598, "loss": 0.9642, "num_tokens": 95999280.0, "step": 1514 }, { "epoch": 2.7904191616766467, "grad_norm": 0.17411412298679352, "learning_rate": 0.0002676488643339472, "loss": 0.9048, "num_tokens": 96063327.0, "step": 1515 }, { "epoch": 2.792261630584984, "grad_norm": 0.1616029590368271, "learning_rate": 0.00026749539594843466, "loss": 0.7966, "num_tokens": 96125470.0, "step": 1516 }, { "epoch": 2.794104099493321, "grad_norm": 0.18403594195842743, "learning_rate": 0.000267341927562922, "loss": 0.8775, "num_tokens": 96188516.0, "step": 1517 }, { "epoch": 2.795946568401658, "grad_norm": 0.17351724207401276, "learning_rate": 0.0002671884591774095, "loss": 0.8955, "num_tokens": 96251619.0, "step": 1518 }, { "epoch": 2.7977890373099954, "grad_norm": 0.16978870332241058, "learning_rate": 0.00026703499079189686, "loss": 0.8283, "num_tokens": 96315731.0, "step": 1519 }, { "epoch": 2.799631506218333, "grad_norm": 0.17543160915374756, "learning_rate": 0.00026688152240638433, "loss": 0.8371, "num_tokens": 96378209.0, "step": 1520 }, { "epoch": 2.8014739751266697, "grad_norm": 0.1720304936170578, "learning_rate": 0.0002667280540208717, "loss": 0.8781, "num_tokens": 96441203.0, "step": 1521 }, { "epoch": 2.8033164440350067, "grad_norm": 0.16554026305675507, "learning_rate": 0.0002665745856353591, "loss": 0.893, "num_tokens": 96505022.0, "step": 1522 }, { "epoch": 2.805158912943344, "grad_norm": 0.16832071542739868, "learning_rate": 0.0002664211172498465, "loss": 0.8269, "num_tokens": 96568456.0, "step": 1523 }, { "epoch": 2.8070013818516815, "grad_norm": 0.16560205817222595, "learning_rate": 0.00026626764886433394, "loss": 0.9026, "num_tokens": 96632382.0, "step": 1524 }, { "epoch": 2.8088438507600184, "grad_norm": 0.17545996606349945, "learning_rate": 0.00026611418047882136, "loss": 0.8699, "num_tokens": 96694828.0, "step": 1525 }, { "epoch": 2.8106863196683554, "grad_norm": 0.18063123524188995, "learning_rate": 0.0002659607120933088, "loss": 0.8113, "num_tokens": 96758250.0, "step": 1526 }, { "epoch": 2.8125287885766928, "grad_norm": 0.17142920196056366, "learning_rate": 0.0002658072437077962, "loss": 0.9111, "num_tokens": 96821714.0, "step": 1527 }, { "epoch": 2.81437125748503, "grad_norm": 0.17291228473186493, "learning_rate": 0.0002656537753222836, "loss": 0.8578, "num_tokens": 96885445.0, "step": 1528 }, { "epoch": 2.816213726393367, "grad_norm": 0.16638997197151184, "learning_rate": 0.00026550030693677103, "loss": 0.8663, "num_tokens": 96948713.0, "step": 1529 }, { "epoch": 2.818056195301704, "grad_norm": 0.17010775208473206, "learning_rate": 0.00026534683855125845, "loss": 0.8678, "num_tokens": 97011690.0, "step": 1530 }, { "epoch": 2.8198986642100414, "grad_norm": 0.1717519462108612, "learning_rate": 0.00026519337016574587, "loss": 0.8017, "num_tokens": 97074662.0, "step": 1531 }, { "epoch": 2.821741133118379, "grad_norm": 0.20343242585659027, "learning_rate": 0.0002650399017802333, "loss": 0.8211, "num_tokens": 97138227.0, "step": 1532 }, { "epoch": 2.8235836020267158, "grad_norm": 0.16340969502925873, "learning_rate": 0.0002648864333947207, "loss": 0.9015, "num_tokens": 97202727.0, "step": 1533 }, { "epoch": 2.8254260709350527, "grad_norm": 0.17301443219184875, "learning_rate": 0.0002647329650092081, "loss": 0.846, "num_tokens": 97266479.0, "step": 1534 }, { "epoch": 2.82726853984339, "grad_norm": 0.16751250624656677, "learning_rate": 0.00026457949662369554, "loss": 0.8762, "num_tokens": 97330192.0, "step": 1535 }, { "epoch": 2.8291110087517275, "grad_norm": 0.1738235205411911, "learning_rate": 0.00026442602823818296, "loss": 0.8371, "num_tokens": 97393640.0, "step": 1536 }, { "epoch": 2.8309534776600644, "grad_norm": 0.1770419031381607, "learning_rate": 0.0002642725598526703, "loss": 0.8848, "num_tokens": 97454643.0, "step": 1537 }, { "epoch": 2.8327959465684014, "grad_norm": 0.17476016283035278, "learning_rate": 0.0002641190914671578, "loss": 0.8579, "num_tokens": 97517594.0, "step": 1538 }, { "epoch": 2.834638415476739, "grad_norm": 0.17220346629619598, "learning_rate": 0.00026396562308164516, "loss": 0.8042, "num_tokens": 97581836.0, "step": 1539 }, { "epoch": 2.836480884385076, "grad_norm": 0.16686995327472687, "learning_rate": 0.00026381215469613263, "loss": 0.9216, "num_tokens": 97646490.0, "step": 1540 }, { "epoch": 2.838323353293413, "grad_norm": 0.16476671397686005, "learning_rate": 0.00026365868631062, "loss": 0.8723, "num_tokens": 97711751.0, "step": 1541 }, { "epoch": 2.8401658222017505, "grad_norm": 0.1725228875875473, "learning_rate": 0.00026350521792510746, "loss": 0.895, "num_tokens": 97774503.0, "step": 1542 }, { "epoch": 2.8420082911100875, "grad_norm": 0.17373448610305786, "learning_rate": 0.0002633517495395948, "loss": 0.8666, "num_tokens": 97837283.0, "step": 1543 }, { "epoch": 2.843850760018425, "grad_norm": 0.1759302020072937, "learning_rate": 0.0002631982811540823, "loss": 0.8662, "num_tokens": 97899998.0, "step": 1544 }, { "epoch": 2.845693228926762, "grad_norm": 0.16506004333496094, "learning_rate": 0.00026304481276856966, "loss": 0.8257, "num_tokens": 97963444.0, "step": 1545 }, { "epoch": 2.847535697835099, "grad_norm": 0.16711178421974182, "learning_rate": 0.00026289134438305713, "loss": 0.8882, "num_tokens": 98026973.0, "step": 1546 }, { "epoch": 2.849378166743436, "grad_norm": 0.1704748123884201, "learning_rate": 0.0002627378759975445, "loss": 0.8305, "num_tokens": 98091251.0, "step": 1547 }, { "epoch": 2.8512206356517735, "grad_norm": 0.18302534520626068, "learning_rate": 0.00026258440761203197, "loss": 0.8083, "num_tokens": 98155076.0, "step": 1548 }, { "epoch": 2.8530631045601105, "grad_norm": 0.16991528868675232, "learning_rate": 0.00026243093922651933, "loss": 0.9302, "num_tokens": 98217643.0, "step": 1549 }, { "epoch": 2.854905573468448, "grad_norm": 0.1761588603258133, "learning_rate": 0.0002622774708410068, "loss": 0.8524, "num_tokens": 98280970.0, "step": 1550 }, { "epoch": 2.856748042376785, "grad_norm": 0.17404231429100037, "learning_rate": 0.00026212400245549417, "loss": 0.9179, "num_tokens": 98344173.0, "step": 1551 }, { "epoch": 2.858590511285122, "grad_norm": 0.20156294107437134, "learning_rate": 0.00026197053406998164, "loss": 0.9172, "num_tokens": 98405217.0, "step": 1552 }, { "epoch": 2.860432980193459, "grad_norm": 0.16533537209033966, "learning_rate": 0.000261817065684469, "loss": 0.8241, "num_tokens": 98469561.0, "step": 1553 }, { "epoch": 2.8622754491017965, "grad_norm": 0.1783110499382019, "learning_rate": 0.0002616635972989564, "loss": 0.8316, "num_tokens": 98532623.0, "step": 1554 }, { "epoch": 2.8641179180101335, "grad_norm": 0.17590516805648804, "learning_rate": 0.00026151012891344384, "loss": 0.9001, "num_tokens": 98595232.0, "step": 1555 }, { "epoch": 2.865960386918471, "grad_norm": 0.17514266073703766, "learning_rate": 0.00026135666052793126, "loss": 0.8101, "num_tokens": 98657588.0, "step": 1556 }, { "epoch": 2.867802855826808, "grad_norm": 0.17770273983478546, "learning_rate": 0.0002612031921424187, "loss": 0.9195, "num_tokens": 98721540.0, "step": 1557 }, { "epoch": 2.869645324735145, "grad_norm": 0.16457872092723846, "learning_rate": 0.0002610497237569061, "loss": 0.8778, "num_tokens": 98784722.0, "step": 1558 }, { "epoch": 2.871487793643482, "grad_norm": 0.17271296679973602, "learning_rate": 0.0002608962553713935, "loss": 0.8629, "num_tokens": 98848784.0, "step": 1559 }, { "epoch": 2.8733302625518196, "grad_norm": 0.1697774976491928, "learning_rate": 0.0002607427869858809, "loss": 0.8165, "num_tokens": 98912410.0, "step": 1560 }, { "epoch": 2.8751727314601565, "grad_norm": 0.18373943865299225, "learning_rate": 0.00026058931860036835, "loss": 0.7957, "num_tokens": 98974569.0, "step": 1561 }, { "epoch": 2.877015200368494, "grad_norm": 0.18497443199157715, "learning_rate": 0.0002604358502148557, "loss": 0.8965, "num_tokens": 99037810.0, "step": 1562 }, { "epoch": 2.878857669276831, "grad_norm": 0.18348988890647888, "learning_rate": 0.0002602823818293432, "loss": 0.8618, "num_tokens": 99102073.0, "step": 1563 }, { "epoch": 2.8807001381851682, "grad_norm": 0.16340500116348267, "learning_rate": 0.00026012891344383054, "loss": 0.9297, "num_tokens": 99164777.0, "step": 1564 }, { "epoch": 2.882542607093505, "grad_norm": 0.1634228229522705, "learning_rate": 0.000259975445058318, "loss": 0.9151, "num_tokens": 99228085.0, "step": 1565 }, { "epoch": 2.8843850760018426, "grad_norm": 0.1776060163974762, "learning_rate": 0.0002598219766728054, "loss": 0.8156, "num_tokens": 99290740.0, "step": 1566 }, { "epoch": 2.8862275449101795, "grad_norm": 0.36051756143569946, "learning_rate": 0.00025966850828729285, "loss": 0.8526, "num_tokens": 99353357.0, "step": 1567 }, { "epoch": 2.888070013818517, "grad_norm": 0.16523820161819458, "learning_rate": 0.0002595150399017802, "loss": 0.9237, "num_tokens": 99417804.0, "step": 1568 }, { "epoch": 2.889912482726854, "grad_norm": 0.17544612288475037, "learning_rate": 0.00025936157151626763, "loss": 0.9253, "num_tokens": 99481208.0, "step": 1569 }, { "epoch": 2.8917549516351913, "grad_norm": 0.1695338785648346, "learning_rate": 0.00025920810313075505, "loss": 0.8348, "num_tokens": 99545448.0, "step": 1570 }, { "epoch": 2.893597420543528, "grad_norm": 0.17342506349086761, "learning_rate": 0.00025905463474524247, "loss": 0.8877, "num_tokens": 99608718.0, "step": 1571 }, { "epoch": 2.8954398894518656, "grad_norm": 0.1688850075006485, "learning_rate": 0.0002589011663597299, "loss": 0.879, "num_tokens": 99673030.0, "step": 1572 }, { "epoch": 2.8972823583602025, "grad_norm": 0.17778171598911285, "learning_rate": 0.0002587476979742173, "loss": 0.9142, "num_tokens": 99736586.0, "step": 1573 }, { "epoch": 2.89912482726854, "grad_norm": 0.17917901277542114, "learning_rate": 0.0002585942295887047, "loss": 0.911, "num_tokens": 99800755.0, "step": 1574 }, { "epoch": 2.900967296176877, "grad_norm": 0.16729219257831573, "learning_rate": 0.00025844076120319214, "loss": 0.9507, "num_tokens": 99864555.0, "step": 1575 }, { "epoch": 2.9028097650852143, "grad_norm": 0.16688989102840424, "learning_rate": 0.00025828729281767956, "loss": 0.8634, "num_tokens": 99929227.0, "step": 1576 }, { "epoch": 2.904652233993551, "grad_norm": 0.1752895563840866, "learning_rate": 0.000258133824432167, "loss": 0.8573, "num_tokens": 99991815.0, "step": 1577 }, { "epoch": 2.9064947029018886, "grad_norm": 0.17227163910865784, "learning_rate": 0.0002579803560466544, "loss": 0.7937, "num_tokens": 100054595.0, "step": 1578 }, { "epoch": 2.9083371718102256, "grad_norm": 0.17118190228939056, "learning_rate": 0.0002578268876611418, "loss": 0.854, "num_tokens": 100118321.0, "step": 1579 }, { "epoch": 2.910179640718563, "grad_norm": 0.1650063544511795, "learning_rate": 0.00025767341927562923, "loss": 0.8971, "num_tokens": 100182465.0, "step": 1580 }, { "epoch": 2.9120221096269, "grad_norm": 0.17309200763702393, "learning_rate": 0.00025751995089011665, "loss": 0.894, "num_tokens": 100246421.0, "step": 1581 }, { "epoch": 2.9138645785352373, "grad_norm": 0.18739920854568481, "learning_rate": 0.00025736648250460406, "loss": 0.8616, "num_tokens": 100309934.0, "step": 1582 }, { "epoch": 2.9157070474435742, "grad_norm": 0.1774493157863617, "learning_rate": 0.0002572130141190915, "loss": 0.8673, "num_tokens": 100371422.0, "step": 1583 }, { "epoch": 2.9175495163519116, "grad_norm": 0.1730918288230896, "learning_rate": 0.00025705954573357884, "loss": 0.8815, "num_tokens": 100435372.0, "step": 1584 }, { "epoch": 2.9193919852602486, "grad_norm": 0.1724010854959488, "learning_rate": 0.0002569060773480663, "loss": 0.8553, "num_tokens": 100497873.0, "step": 1585 }, { "epoch": 2.921234454168586, "grad_norm": 0.16980569064617157, "learning_rate": 0.0002567526089625537, "loss": 0.8414, "num_tokens": 100560578.0, "step": 1586 }, { "epoch": 2.9230769230769234, "grad_norm": 0.1787833422422409, "learning_rate": 0.00025659914057704115, "loss": 0.8704, "num_tokens": 100624257.0, "step": 1587 }, { "epoch": 2.9249193919852603, "grad_norm": 0.17083413898944855, "learning_rate": 0.0002564456721915285, "loss": 0.9127, "num_tokens": 100689029.0, "step": 1588 }, { "epoch": 2.9267618608935972, "grad_norm": 0.16461075842380524, "learning_rate": 0.000256292203806016, "loss": 0.9087, "num_tokens": 100752577.0, "step": 1589 }, { "epoch": 2.9286043298019346, "grad_norm": 0.17876477539539337, "learning_rate": 0.00025613873542050335, "loss": 0.8096, "num_tokens": 100816775.0, "step": 1590 }, { "epoch": 2.930446798710272, "grad_norm": 0.1743822991847992, "learning_rate": 0.0002559852670349908, "loss": 0.8406, "num_tokens": 100879540.0, "step": 1591 }, { "epoch": 2.932289267618609, "grad_norm": 0.17934519052505493, "learning_rate": 0.0002558317986494782, "loss": 0.8098, "num_tokens": 100942941.0, "step": 1592 }, { "epoch": 2.934131736526946, "grad_norm": 0.17595845460891724, "learning_rate": 0.00025567833026396566, "loss": 0.8519, "num_tokens": 101005932.0, "step": 1593 }, { "epoch": 2.9359742054352833, "grad_norm": 0.17553862929344177, "learning_rate": 0.000255524861878453, "loss": 0.7757, "num_tokens": 101068218.0, "step": 1594 }, { "epoch": 2.9378166743436207, "grad_norm": 0.1687108725309372, "learning_rate": 0.0002553713934929405, "loss": 0.8674, "num_tokens": 101132189.0, "step": 1595 }, { "epoch": 2.9396591432519577, "grad_norm": 0.17192941904067993, "learning_rate": 0.00025521792510742786, "loss": 0.8659, "num_tokens": 101195815.0, "step": 1596 }, { "epoch": 2.9415016121602946, "grad_norm": 0.16362158954143524, "learning_rate": 0.00025506445672191533, "loss": 0.8372, "num_tokens": 101258390.0, "step": 1597 }, { "epoch": 2.943344081068632, "grad_norm": 0.1757393628358841, "learning_rate": 0.0002549109883364027, "loss": 0.8608, "num_tokens": 101321206.0, "step": 1598 }, { "epoch": 2.9451865499769694, "grad_norm": 0.181174635887146, "learning_rate": 0.00025475751995089016, "loss": 0.9219, "num_tokens": 101384390.0, "step": 1599 }, { "epoch": 2.9470290188853063, "grad_norm": 0.1747407466173172, "learning_rate": 0.00025460405156537753, "loss": 0.8954, "num_tokens": 101447903.0, "step": 1600 }, { "epoch": 2.9488714877936433, "grad_norm": 0.17393234372138977, "learning_rate": 0.00025445058317986495, "loss": 0.9391, "num_tokens": 101509746.0, "step": 1601 }, { "epoch": 2.9507139567019807, "grad_norm": 0.1769677698612213, "learning_rate": 0.00025429711479435236, "loss": 0.839, "num_tokens": 101573202.0, "step": 1602 }, { "epoch": 2.952556425610318, "grad_norm": 0.18556934595108032, "learning_rate": 0.0002541436464088398, "loss": 0.8471, "num_tokens": 101636455.0, "step": 1603 }, { "epoch": 2.954398894518655, "grad_norm": 0.17087626457214355, "learning_rate": 0.0002539901780233272, "loss": 0.8838, "num_tokens": 101700716.0, "step": 1604 }, { "epoch": 2.956241363426992, "grad_norm": 0.17766809463500977, "learning_rate": 0.0002538367096378146, "loss": 0.8403, "num_tokens": 101764126.0, "step": 1605 }, { "epoch": 2.9580838323353293, "grad_norm": 0.1708325743675232, "learning_rate": 0.00025368324125230203, "loss": 0.876, "num_tokens": 101828471.0, "step": 1606 }, { "epoch": 2.9599263012436667, "grad_norm": 0.17169401049613953, "learning_rate": 0.00025352977286678945, "loss": 0.8308, "num_tokens": 101890600.0, "step": 1607 }, { "epoch": 2.9617687701520037, "grad_norm": 0.1759345382452011, "learning_rate": 0.00025337630448127687, "loss": 0.9213, "num_tokens": 101952961.0, "step": 1608 }, { "epoch": 2.9636112390603406, "grad_norm": 0.1657620668411255, "learning_rate": 0.0002532228360957643, "loss": 0.9502, "num_tokens": 102016853.0, "step": 1609 }, { "epoch": 2.965453707968678, "grad_norm": 0.18161854147911072, "learning_rate": 0.0002530693677102517, "loss": 0.9983, "num_tokens": 102078478.0, "step": 1610 }, { "epoch": 2.9672961768770154, "grad_norm": 0.18570812046527863, "learning_rate": 0.0002529158993247391, "loss": 0.8161, "num_tokens": 102141692.0, "step": 1611 }, { "epoch": 2.9691386457853524, "grad_norm": 0.18209852278232574, "learning_rate": 0.00025276243093922654, "loss": 0.8463, "num_tokens": 102204693.0, "step": 1612 }, { "epoch": 2.9709811146936893, "grad_norm": 0.17226946353912354, "learning_rate": 0.00025260896255371396, "loss": 0.9174, "num_tokens": 102268279.0, "step": 1613 }, { "epoch": 2.9728235836020267, "grad_norm": 0.17820438742637634, "learning_rate": 0.0002524554941682014, "loss": 0.8328, "num_tokens": 102331328.0, "step": 1614 }, { "epoch": 2.974666052510364, "grad_norm": 0.17262102663516998, "learning_rate": 0.0002523020257826888, "loss": 0.8422, "num_tokens": 102395156.0, "step": 1615 }, { "epoch": 2.976508521418701, "grad_norm": 0.16888898611068726, "learning_rate": 0.00025214855739717616, "loss": 0.9131, "num_tokens": 102459541.0, "step": 1616 }, { "epoch": 2.978350990327038, "grad_norm": 0.17130553722381592, "learning_rate": 0.00025199508901166363, "loss": 0.9452, "num_tokens": 102522995.0, "step": 1617 }, { "epoch": 2.9801934592353754, "grad_norm": 0.17321966588497162, "learning_rate": 0.000251841620626151, "loss": 0.8672, "num_tokens": 102585720.0, "step": 1618 }, { "epoch": 2.9820359281437128, "grad_norm": 0.1731366217136383, "learning_rate": 0.00025168815224063846, "loss": 0.8567, "num_tokens": 102649673.0, "step": 1619 }, { "epoch": 2.9838783970520497, "grad_norm": 0.17459966242313385, "learning_rate": 0.00025153468385512583, "loss": 0.8974, "num_tokens": 102713613.0, "step": 1620 }, { "epoch": 2.9857208659603867, "grad_norm": 0.16973352432250977, "learning_rate": 0.0002513812154696133, "loss": 0.8551, "num_tokens": 102777584.0, "step": 1621 }, { "epoch": 2.987563334868724, "grad_norm": 0.1834269016981125, "learning_rate": 0.00025122774708410066, "loss": 0.8286, "num_tokens": 102840319.0, "step": 1622 }, { "epoch": 2.9894058037770614, "grad_norm": 0.16938187181949615, "learning_rate": 0.00025107427869858814, "loss": 0.8742, "num_tokens": 102903077.0, "step": 1623 }, { "epoch": 2.9912482726853984, "grad_norm": 0.17127610743045807, "learning_rate": 0.0002509208103130755, "loss": 0.9885, "num_tokens": 102966375.0, "step": 1624 }, { "epoch": 2.9930907415937353, "grad_norm": 0.16586105525493622, "learning_rate": 0.00025076734192756297, "loss": 0.8151, "num_tokens": 103030020.0, "step": 1625 }, { "epoch": 2.9949332105020727, "grad_norm": 0.16557806730270386, "learning_rate": 0.00025061387354205033, "loss": 0.8447, "num_tokens": 103094364.0, "step": 1626 }, { "epoch": 2.99677567941041, "grad_norm": 0.172907754778862, "learning_rate": 0.0002504604051565378, "loss": 0.9223, "num_tokens": 103157615.0, "step": 1627 }, { "epoch": 2.998618148318747, "grad_norm": 0.17664043605327606, "learning_rate": 0.00025030693677102517, "loss": 0.8609, "num_tokens": 103221610.0, "step": 1628 }, { "epoch": 3.0, "grad_norm": 0.20302985608577728, "learning_rate": 0.00025015346838551264, "loss": 0.8363, "num_tokens": 103269292.0, "step": 1629 }, { "epoch": 3.0, "eval_loss": 1.2086403369903564, "eval_num_tokens": 103269292.0, "eval_runtime": 2.1328, "eval_samples_per_second": 23.443, "eval_steps_per_second": 0.938, "step": 1629 } ], "logging_steps": 1.0, "max_steps": 3258, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.24375825050239e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }