Instructions to use Sathvik0101/cyber-duel-tiny-adapter with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Sathvik0101/cyber-duel-tiny-adapter with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")
model = PeftModel.from_pretrained(base_model, "Sathvik0101/cyber-duel-tiny-adapter")
```
- Transformers
How to use Sathvik0101/cyber-duel-tiny-adapter with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Sathvik0101/cyber-duel-tiny-adapter")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Sathvik0101/cyber-duel-tiny-adapter", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use Sathvik0101/cyber-duel-tiny-adapter with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Sathvik0101/cyber-duel-tiny-adapter"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Sathvik0101/cyber-duel-tiny-adapter",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Sathvik0101/cyber-duel-tiny-adapter
- SGLang
How to use Sathvik0101/cyber-duel-tiny-adapter with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Sathvik0101/cyber-duel-tiny-adapter" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Sathvik0101/cyber-duel-tiny-adapter",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Sathvik0101/cyber-duel-tiny-adapter" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Sathvik0101/cyber-duel-tiny-adapter",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Sathvik0101/cyber-duel-tiny-adapter with Docker Model Runner:
docker model run hf.co/Sathvik0101/cyber-duel-tiny-adapter
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 4500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.548675110936165, | |
| "epoch": 0.013333333333333334, | |
| "grad_norm": 6.004289150238037, | |
| "learning_rate": 1.688888888888889e-05, | |
| "loss": 3.992266082763672, | |
| "mean_token_accuracy": 0.4231670804321766, | |
| "num_tokens": 102186.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.2785673171281813, | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 3.0929577350616455, | |
| "learning_rate": 3.466666666666667e-05, | |
| "loss": 2.8258544921875, | |
| "mean_token_accuracy": 0.5193272314965725, | |
| "num_tokens": 204333.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.6070524707436562, | |
| "epoch": 0.04, | |
| "grad_norm": 2.344675302505493, | |
| "learning_rate": 5.244444444444445e-05, | |
| "loss": 1.4720239639282227, | |
| "mean_token_accuracy": 0.7221725225448609, | |
| "num_tokens": 306198.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.4960472501814365, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 2.080559730529785, | |
| "learning_rate": 7.022222222222222e-05, | |
| "loss": 0.4806540012359619, | |
| "mean_token_accuracy": 0.9012673273682594, | |
| "num_tokens": 408035.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.1692034611478448, | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 1.5278408527374268, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 0.1592921018600464, | |
| "mean_token_accuracy": 0.9599622413516045, | |
| "num_tokens": 509962.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.11861470770090818, | |
| "epoch": 0.08, | |
| "grad_norm": 0.999698281288147, | |
| "learning_rate": 0.00010577777777777777, | |
| "loss": 0.11052950620651245, | |
| "mean_token_accuracy": 0.9685859054327011, | |
| "num_tokens": 611562.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.1036016432568431, | |
| "epoch": 0.09333333333333334, | |
| "grad_norm": 0.9115886092185974, | |
| "learning_rate": 0.00012355555555555557, | |
| "loss": 0.0914052426815033, | |
| "mean_token_accuracy": 0.9704682394862175, | |
| "num_tokens": 713684.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.09701150320470334, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 0.6500758528709412, | |
| "learning_rate": 0.00014133333333333334, | |
| "loss": 0.08168401718139648, | |
| "mean_token_accuracy": 0.9728480890393257, | |
| "num_tokens": 815728.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.0902867017313838, | |
| "epoch": 0.12, | |
| "grad_norm": 0.4816068112850189, | |
| "learning_rate": 0.00015911111111111112, | |
| "loss": 0.0673690140247345, | |
| "mean_token_accuracy": 0.9743824899196625, | |
| "num_tokens": 917588.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.07180177625268698, | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 0.42101994156837463, | |
| "learning_rate": 0.0001768888888888889, | |
| "loss": 0.05838126540184021, | |
| "mean_token_accuracy": 0.975058288872242, | |
| "num_tokens": 1020041.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.06279958104714752, | |
| "epoch": 0.14666666666666667, | |
| "grad_norm": 0.41531553864479065, | |
| "learning_rate": 0.0001946666666666667, | |
| "loss": 0.05505728721618652, | |
| "mean_token_accuracy": 0.976372754573822, | |
| "num_tokens": 1121933.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.05904992977157235, | |
| "epoch": 0.16, | |
| "grad_norm": 0.5666757822036743, | |
| "learning_rate": 0.00019999470763544457, | |
| "loss": 0.052491378784179685, | |
| "mean_token_accuracy": 0.9762984499335289, | |
| "num_tokens": 1223670.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.05695097530260682, | |
| "epoch": 0.17333333333333334, | |
| "grad_norm": 0.39107683300971985, | |
| "learning_rate": 0.00019996878719840213, | |
| "loss": 0.05221613645553589, | |
| "mean_token_accuracy": 0.9769444420933724, | |
| "num_tokens": 1325903.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.05454709641635418, | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 0.2881831228733063, | |
| "learning_rate": 0.00019992127221406275, | |
| "loss": 0.05105168223381042, | |
| "mean_token_accuracy": 0.9766697883605957, | |
| "num_tokens": 1427883.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.05568597661331296, | |
| "epoch": 0.2, | |
| "grad_norm": 0.2969810962677002, | |
| "learning_rate": 0.00019985217294627577, | |
| "loss": 0.05190561413764953, | |
| "mean_token_accuracy": 0.9768449172377587, | |
| "num_tokens": 1529850.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.05605392120778561, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.39327648282051086, | |
| "learning_rate": 0.00019976150432137423, | |
| "loss": 0.05125090479850769, | |
| "mean_token_accuracy": 0.9767352715134621, | |
| "num_tokens": 1631796.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.05631188191473484, | |
| "epoch": 0.22666666666666666, | |
| "grad_norm": 0.2569703757762909, | |
| "learning_rate": 0.00019964928592495045, | |
| "loss": 0.05136184692382813, | |
| "mean_token_accuracy": 0.9767047330737114, | |
| "num_tokens": 1733431.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.054749509692192076, | |
| "epoch": 0.24, | |
| "grad_norm": 0.2503352761268616, | |
| "learning_rate": 0.00019951554199762526, | |
| "loss": 0.04927194118499756, | |
| "mean_token_accuracy": 0.9772127717733383, | |
| "num_tokens": 1835736.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.053956403583288196, | |
| "epoch": 0.25333333333333335, | |
| "grad_norm": 0.26568838953971863, | |
| "learning_rate": 0.00019936030142981182, | |
| "loss": 0.04831983149051666, | |
| "mean_token_accuracy": 0.9772727772593498, | |
| "num_tokens": 1937395.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.05297513753175735, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.21782436966896057, | |
| "learning_rate": 0.00019918359775547489, | |
| "loss": 0.048703563213348386, | |
| "mean_token_accuracy": 0.9776117220520973, | |
| "num_tokens": 2039661.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.05235615810379386, | |
| "epoch": 0.28, | |
| "grad_norm": 0.2456953078508377, | |
| "learning_rate": 0.00019898546914488697, | |
| "loss": 0.04742903709411621, | |
| "mean_token_accuracy": 0.9779680415987968, | |
| "num_tokens": 2141312.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.05012538954615593, | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 0.17193332314491272, | |
| "learning_rate": 0.00019876595839638314, | |
| "loss": 0.04511936604976654, | |
| "mean_token_accuracy": 0.978802102804184, | |
| "num_tokens": 2243220.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.050425101164728404, | |
| "epoch": 0.30666666666666664, | |
| "grad_norm": 0.19117344915866852, | |
| "learning_rate": 0.00019852511292711608, | |
| "loss": 0.04454375207424164, | |
| "mean_token_accuracy": 0.9793910697102547, | |
| "num_tokens": 2345110.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.0502777012065053, | |
| "epoch": 0.32, | |
| "grad_norm": 0.1484805941581726, | |
| "learning_rate": 0.0001982629847628132, | |
| "loss": 0.045093965530395505, | |
| "mean_token_accuracy": 0.9782336875796318, | |
| "num_tokens": 2446814.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.04916129466146231, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.17659035325050354, | |
| "learning_rate": 0.0001979796305265386, | |
| "loss": 0.04536721706390381, | |
| "mean_token_accuracy": 0.9788262486457825, | |
| "num_tokens": 2548699.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.04801498837769032, | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 0.18467392027378082, | |
| "learning_rate": 0.0001976751114264616, | |
| "loss": 0.04428495168685913, | |
| "mean_token_accuracy": 0.9791656643152237, | |
| "num_tokens": 2650925.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.04973381711170077, | |
| "epoch": 0.36, | |
| "grad_norm": 0.22871969640254974, | |
| "learning_rate": 0.0001973494932426351, | |
| "loss": 0.04659122526645661, | |
| "mean_token_accuracy": 0.9777900949120522, | |
| "num_tokens": 2753152.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.050069388933479786, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 0.14215655624866486, | |
| "learning_rate": 0.00019700284631278623, | |
| "loss": 0.04543479979038238, | |
| "mean_token_accuracy": 0.9784642964601517, | |
| "num_tokens": 2855157.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.048892225697636606, | |
| "epoch": 0.38666666666666666, | |
| "grad_norm": 0.14485321938991547, | |
| "learning_rate": 0.00019663524551712236, | |
| "loss": 0.043998023867607115, | |
| "mean_token_accuracy": 0.9789358124136924, | |
| "num_tokens": 2957430.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.049546369817107916, | |
| "epoch": 0.4, | |
| "grad_norm": 0.1522541642189026, | |
| "learning_rate": 0.0001962467702621562, | |
| "loss": 0.04526585042476654, | |
| "mean_token_accuracy": 0.9789461970329285, | |
| "num_tokens": 3059857.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.048749705869704486, | |
| "epoch": 0.41333333333333333, | |
| "grad_norm": 0.14776450395584106, | |
| "learning_rate": 0.00019583750446355286, | |
| "loss": 0.04488187730312347, | |
| "mean_token_accuracy": 0.9790951684117317, | |
| "num_tokens": 3161377.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.04819442732259631, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.155587837100029, | |
| "learning_rate": 0.000195407536528003, | |
| "loss": 0.04454294443130493, | |
| "mean_token_accuracy": 0.9792696803808212, | |
| "num_tokens": 3263597.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.048739112261682746, | |
| "epoch": 0.44, | |
| "grad_norm": 0.24131548404693604, | |
| "learning_rate": 0.0001949569593341258, | |
| "loss": 0.04449517726898193, | |
| "mean_token_accuracy": 0.9789462149143219, | |
| "num_tokens": 3365773.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.04729501772671938, | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 0.16851578652858734, | |
| "learning_rate": 0.00019448587021240611, | |
| "loss": 0.0436316579580307, | |
| "mean_token_accuracy": 0.9790461182594299, | |
| "num_tokens": 3467719.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.048864346370100974, | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 0.17274609208106995, | |
| "learning_rate": 0.00019399437092416967, | |
| "loss": 0.04535620212554932, | |
| "mean_token_accuracy": 0.9788791447877884, | |
| "num_tokens": 3569559.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.04898029724135995, | |
| "epoch": 0.48, | |
| "grad_norm": 0.13499416410923004, | |
| "learning_rate": 0.00019348256763960145, | |
| "loss": 0.045434945821762086, | |
| "mean_token_accuracy": 0.9788094267249108, | |
| "num_tokens": 3671491.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.04580554729327559, | |
| "epoch": 0.49333333333333335, | |
| "grad_norm": 0.12506447732448578, | |
| "learning_rate": 0.00019295057091481147, | |
| "loss": 0.04356709420681, | |
| "mean_token_accuracy": 0.9791021943092346, | |
| "num_tokens": 3773051.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.047521025873720646, | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 0.121482253074646, | |
| "learning_rate": 0.00019239849566795323, | |
| "loss": 0.044592976570129395, | |
| "mean_token_accuracy": 0.9786569505929947, | |
| "num_tokens": 3875663.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.045532725658267735, | |
| "epoch": 0.52, | |
| "grad_norm": 0.13711974024772644, | |
| "learning_rate": 0.00019182646115439996, | |
| "loss": 0.042892631888389585, | |
| "mean_token_accuracy": 0.979731023311615, | |
| "num_tokens": 3977742.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.04748789621517062, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.126457080245018, | |
| "learning_rate": 0.00019123459094098398, | |
| "loss": 0.04508825838565826, | |
| "mean_token_accuracy": 0.9783048242330551, | |
| "num_tokens": 4079943.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.045889181550592184, | |
| "epoch": 0.5466666666666666, | |
| "grad_norm": 0.12796172499656677, | |
| "learning_rate": 0.00019062301287930446, | |
| "loss": 0.04326332211494446, | |
| "mean_token_accuracy": 0.979296863079071, | |
| "num_tokens": 4181963.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.045128315966576335, | |
| "epoch": 0.56, | |
| "grad_norm": 0.0813562199473381, | |
| "learning_rate": 0.00018999185907811009, | |
| "loss": 0.04314403533935547, | |
| "mean_token_accuracy": 0.9794226452708245, | |
| "num_tokens": 4283940.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.04633188545703888, | |
| "epoch": 0.5733333333333334, | |
| "grad_norm": 0.13212576508522034, | |
| "learning_rate": 0.00018934126587476162, | |
| "loss": 0.04438722729682922, | |
| "mean_token_accuracy": 0.9792284339666366, | |
| "num_tokens": 4386033.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.046954588033258915, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 0.24543477594852448, | |
| "learning_rate": 0.0001886713738057815, | |
| "loss": 0.04496486783027649, | |
| "mean_token_accuracy": 0.978602097928524, | |
| "num_tokens": 4488033.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.047627194225788115, | |
| "epoch": 0.6, | |
| "grad_norm": 0.15973004698753357, | |
| "learning_rate": 0.000187982327576496, | |
| "loss": 0.0447381466627121, | |
| "mean_token_accuracy": 0.978855662047863, | |
| "num_tokens": 4590393.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.049009975790977475, | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 0.4588961899280548, | |
| "learning_rate": 0.000187274276029777, | |
| "loss": 0.04679847955703735, | |
| "mean_token_accuracy": 0.9788309365510941, | |
| "num_tokens": 4692314.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.05283641302958131, | |
| "epoch": 0.6266666666666667, | |
| "grad_norm": 0.17900370061397552, | |
| "learning_rate": 0.00018654737211389004, | |
| "loss": 0.04886095821857452, | |
| "mean_token_accuracy": 0.9779917612671852, | |
| "num_tokens": 4794297.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.05194324087351561, | |
| "epoch": 0.64, | |
| "grad_norm": 0.2685967683792114, | |
| "learning_rate": 0.00018580177284945566, | |
| "loss": 0.04925000071525574, | |
| "mean_token_accuracy": 0.9787736907601357, | |
| "num_tokens": 4896719.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.04687528889626265, | |
| "epoch": 0.6533333333333333, | |
| "grad_norm": 0.3776164948940277, | |
| "learning_rate": 0.0001850376392955307, | |
| "loss": 0.04358056485652924, | |
| "mean_token_accuracy": 0.9792398914694787, | |
| "num_tokens": 4998801.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.04969303589314222, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.10363394021987915, | |
| "learning_rate": 0.00018425513651481747, | |
| "loss": 0.04642247259616852, | |
| "mean_token_accuracy": 0.9783516511321068, | |
| "num_tokens": 5100997.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.047921424824744464, | |
| "epoch": 0.68, | |
| "grad_norm": 0.1332525759935379, | |
| "learning_rate": 0.00018345443353800839, | |
| "loss": 0.04439827501773834, | |
| "mean_token_accuracy": 0.9791212469339371, | |
| "num_tokens": 5202682.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.047575213573873044, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 0.08405883610248566, | |
| "learning_rate": 0.00018263570332727275, | |
| "loss": 0.043652302026748656, | |
| "mean_token_accuracy": 0.9786113709211349, | |
| "num_tokens": 5304249.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 0.04774442110210657, | |
| "epoch": 0.7066666666666667, | |
| "grad_norm": 0.09579049050807953, | |
| "learning_rate": 0.00018179912273889501, | |
| "loss": 0.043841779232025146, | |
| "mean_token_accuracy": 0.9791841998696327, | |
| "num_tokens": 5406457.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.04760089740157127, | |
| "epoch": 0.72, | |
| "grad_norm": 0.13812078535556793, | |
| "learning_rate": 0.00018094487248507127, | |
| "loss": 0.04469398260116577, | |
| "mean_token_accuracy": 0.9787818253040313, | |
| "num_tokens": 5508325.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.04628140116110444, | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 0.09030942618846893, | |
| "learning_rate": 0.00018007313709487334, | |
| "loss": 0.043077632784843445, | |
| "mean_token_accuracy": 0.9798856094479561, | |
| "num_tokens": 5609876.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.04589016325771809, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 0.0854763314127922, | |
| "learning_rate": 0.00017918410487438805, | |
| "loss": 0.04384036958217621, | |
| "mean_token_accuracy": 0.9791762813925743, | |
| "num_tokens": 5712340.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.04689710335806012, | |
| "epoch": 0.76, | |
| "grad_norm": 0.10074414312839508, | |
| "learning_rate": 0.00017827796786604042, | |
| "loss": 0.04416438341140747, | |
| "mean_token_accuracy": 0.979088181257248, | |
| "num_tokens": 5814598.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.04654768798500299, | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 0.07522693276405334, | |
| "learning_rate": 0.0001773549218071105, | |
| "loss": 0.0432561069726944, | |
| "mean_token_accuracy": 0.9793283045291901, | |
| "num_tokens": 5916277.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 0.0449189274571836, | |
| "epoch": 0.7866666666666666, | |
| "grad_norm": 0.12037090212106705, | |
| "learning_rate": 0.00017641516608745114, | |
| "loss": 0.04267836213111877, | |
| "mean_token_accuracy": 0.9796097055077553, | |
| "num_tokens": 6018305.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.04518893817439675, | |
| "epoch": 0.8, | |
| "grad_norm": 0.15295696258544922, | |
| "learning_rate": 0.0001754589037064175, | |
| "loss": 0.04324706792831421, | |
| "mean_token_accuracy": 0.9793181642889977, | |
| "num_tokens": 6120161.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.0459614584222436, | |
| "epoch": 0.8133333333333334, | |
| "grad_norm": 0.10844975709915161, | |
| "learning_rate": 0.0001744863412290165, | |
| "loss": 0.04338730275630951, | |
| "mean_token_accuracy": 0.9787795886397361, | |
| "num_tokens": 6221926.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.04700327459722757, | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 0.12464659661054611, | |
| "learning_rate": 0.00017349768874128603, | |
| "loss": 0.04424178600311279, | |
| "mean_token_accuracy": 0.9791146576404571, | |
| "num_tokens": 6323994.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.045251396391540764, | |
| "epoch": 0.84, | |
| "grad_norm": 0.10585556924343109, | |
| "learning_rate": 0.00017249315980491373, | |
| "loss": 0.04233089089393616, | |
| "mean_token_accuracy": 0.980115057528019, | |
| "num_tokens": 6425801.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 0.04711138280108571, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.10078904032707214, | |
| "learning_rate": 0.0001714729714111049, | |
| "loss": 0.043426957726478574, | |
| "mean_token_accuracy": 0.9791831955313682, | |
| "num_tokens": 6527510.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.04563735323026776, | |
| "epoch": 0.8666666666666667, | |
| "grad_norm": 0.10202273726463318, | |
| "learning_rate": 0.00017043734393370965, | |
| "loss": 0.043241679668426514, | |
| "mean_token_accuracy": 0.9791531518101693, | |
| "num_tokens": 6630052.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.04624767201021314, | |
| "epoch": 0.88, | |
| "grad_norm": 0.1017850786447525, | |
| "learning_rate": 0.0001693865010816192, | |
| "loss": 0.043641078472137454, | |
| "mean_token_accuracy": 0.9791532784700394, | |
| "num_tokens": 6732187.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.04555416237562895, | |
| "epoch": 0.8933333333333333, | |
| "grad_norm": 0.0906793549656868, | |
| "learning_rate": 0.00016832066985044195, | |
| "loss": 0.04301130175590515, | |
| "mean_token_accuracy": 0.9790184095501899, | |
| "num_tokens": 6834270.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.044891719426959756, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 0.06667148321866989, | |
| "learning_rate": 0.00016724008047346947, | |
| "loss": 0.04192114770412445, | |
| "mean_token_accuracy": 0.9799642145633698, | |
| "num_tokens": 6936310.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 0.04586669374257326, | |
| "epoch": 0.92, | |
| "grad_norm": 0.12085918337106705, | |
| "learning_rate": 0.0001661449663719432, | |
| "loss": 0.04404585361480713, | |
| "mean_token_accuracy": 0.9786775410175323, | |
| "num_tokens": 7037928.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.04691507248207927, | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 0.09447435289621353, | |
| "learning_rate": 0.00016503556410463234, | |
| "loss": 0.04427667260169983, | |
| "mean_token_accuracy": 0.9788988634943963, | |
| "num_tokens": 7139966.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.04686050089076162, | |
| "epoch": 0.9466666666666667, | |
| "grad_norm": 0.07748451828956604, | |
| "learning_rate": 0.0001639121133167342, | |
| "loss": 0.043699628114700316, | |
| "mean_token_accuracy": 0.9789900943636894, | |
| "num_tokens": 7242243.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.04621442370116711, | |
| "epoch": 0.96, | |
| "grad_norm": 0.0875391811132431, | |
| "learning_rate": 0.0001627748566881077, | |
| "loss": 0.0435163140296936, | |
| "mean_token_accuracy": 0.9793973177671432, | |
| "num_tokens": 7344333.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 0.04617999196052551, | |
| "epoch": 0.9733333333333334, | |
| "grad_norm": 0.11651453375816345, | |
| "learning_rate": 0.00016162403988085147, | |
| "loss": 0.0438153475522995, | |
| "mean_token_accuracy": 0.9788163512945175, | |
| "num_tokens": 7446501.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 0.04541895473375916, | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 0.10714145004749298, | |
| "learning_rate": 0.0001604599114862375, | |
| "loss": 0.043173199892044066, | |
| "mean_token_accuracy": 0.9791891872882843, | |
| "num_tokens": 7548187.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.04610758051276207, | |
| "epoch": 1.0, | |
| "grad_norm": 0.1056915670633316, | |
| "learning_rate": 0.0001592827229710124, | |
| "loss": 0.04365978240966797, | |
| "mean_token_accuracy": 0.9787515595555305, | |
| "num_tokens": 7650185.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.04553080843761563, | |
| "epoch": 1.0133333333333334, | |
| "grad_norm": 0.08358001708984375, | |
| "learning_rate": 0.00015809272862307724, | |
| "loss": 0.04281379580497742, | |
| "mean_token_accuracy": 0.9787902727723121, | |
| "num_tokens": 7751822.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.04557240409776568, | |
| "epoch": 1.0266666666666666, | |
| "grad_norm": 0.0894247367978096, | |
| "learning_rate": 0.00015689018549655813, | |
| "loss": 0.043633687496185306, | |
| "mean_token_accuracy": 0.9793074056506157, | |
| "num_tokens": 7853924.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.04621814098209143, | |
| "epoch": 1.04, | |
| "grad_norm": 0.060622621327638626, | |
| "learning_rate": 0.00015567535335627916, | |
| "loss": 0.043806785345077516, | |
| "mean_token_accuracy": 0.9790619671344757, | |
| "num_tokens": 7955729.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 0.04529289873316884, | |
| "epoch": 1.0533333333333332, | |
| "grad_norm": 0.06778731197118759, | |
| "learning_rate": 0.0001544484946216499, | |
| "loss": 0.04349397122859955, | |
| "mean_token_accuracy": 0.9791216805577279, | |
| "num_tokens": 8057521.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.045565437898039816, | |
| "epoch": 1.0666666666666667, | |
| "grad_norm": 0.09741676598787308, | |
| "learning_rate": 0.00015320987430997939, | |
| "loss": 0.043324217200279236, | |
| "mean_token_accuracy": 0.9791115581989288, | |
| "num_tokens": 8159337.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.04597685588523746, | |
| "epoch": 1.08, | |
| "grad_norm": 0.09679801762104034, | |
| "learning_rate": 0.00015195975997922892, | |
| "loss": 0.04302051663398743, | |
| "mean_token_accuracy": 0.9793232962489128, | |
| "num_tokens": 8262074.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 0.04526777658611536, | |
| "epoch": 1.0933333333333333, | |
| "grad_norm": 0.10501035302877426, | |
| "learning_rate": 0.00015069842167021635, | |
| "loss": 0.043459060788154605, | |
| "mean_token_accuracy": 0.9790220081806182, | |
| "num_tokens": 8363286.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.04562727101147175, | |
| "epoch": 1.1066666666666667, | |
| "grad_norm": 0.07695911824703217, | |
| "learning_rate": 0.00014942613184828335, | |
| "loss": 0.04361176192760467, | |
| "mean_token_accuracy": 0.978962479531765, | |
| "num_tokens": 8464992.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.04388966728001833, | |
| "epoch": 1.12, | |
| "grad_norm": 0.10466761142015457, | |
| "learning_rate": 0.00014814316534443982, | |
| "loss": 0.04218283891677856, | |
| "mean_token_accuracy": 0.9791669443249702, | |
| "num_tokens": 8567083.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.04554249225184322, | |
| "epoch": 1.1333333333333333, | |
| "grad_norm": 0.07236190885305405, | |
| "learning_rate": 0.0001468497992959965, | |
| "loss": 0.043398627638816835, | |
| "mean_token_accuracy": 0.9791699111461639, | |
| "num_tokens": 8669135.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.043595219124108554, | |
| "epoch": 1.1466666666666667, | |
| "grad_norm": 0.06271807104349136, | |
| "learning_rate": 0.00014554631308669994, | |
| "loss": 0.042030200362205505, | |
| "mean_token_accuracy": 0.979636350274086, | |
| "num_tokens": 8771085.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.04456626381725073, | |
| "epoch": 1.16, | |
| "grad_norm": 0.11451169848442078, | |
| "learning_rate": 0.00014423298828638195, | |
| "loss": 0.04222625195980072, | |
| "mean_token_accuracy": 0.9794944658875465, | |
| "num_tokens": 8873283.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.04446439165621996, | |
| "epoch": 1.1733333333333333, | |
| "grad_norm": 0.1023312583565712, | |
| "learning_rate": 0.00014291010859013688, | |
| "loss": 0.04255003333091736, | |
| "mean_token_accuracy": 0.979724471271038, | |
| "num_tokens": 8975472.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.04486837210133672, | |
| "epoch": 1.1866666666666668, | |
| "grad_norm": 0.10332223773002625, | |
| "learning_rate": 0.00014157795975703986, | |
| "loss": 0.04269057214260101, | |
| "mean_token_accuracy": 0.9796782404184341, | |
| "num_tokens": 9078026.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 0.04620604543015361, | |
| "epoch": 1.2, | |
| "grad_norm": 0.06070537120103836, | |
| "learning_rate": 0.00014023682954841907, | |
| "loss": 0.044662383198738095, | |
| "mean_token_accuracy": 0.9784179985523224, | |
| "num_tokens": 9180444.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.04559714160859585, | |
| "epoch": 1.2133333333333334, | |
| "grad_norm": 0.18560439348220825, | |
| "learning_rate": 0.00013888700766569566, | |
| "loss": 0.04349713623523712, | |
| "mean_token_accuracy": 0.9794085487723351, | |
| "num_tokens": 9282562.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.0467754821293056, | |
| "epoch": 1.2266666666666666, | |
| "grad_norm": 0.08615751564502716, | |
| "learning_rate": 0.00013752878568780446, | |
| "loss": 0.04393337666988373, | |
| "mean_token_accuracy": 0.97873145788908, | |
| "num_tokens": 9384267.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.04674078449606896, | |
| "epoch": 1.24, | |
| "grad_norm": 0.1094692274928093, | |
| "learning_rate": 0.00013616245700820922, | |
| "loss": 0.04425840079784393, | |
| "mean_token_accuracy": 0.9783810645341873, | |
| "num_tokens": 9486293.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.04517263481393456, | |
| "epoch": 1.2533333333333334, | |
| "grad_norm": 0.0624544620513916, | |
| "learning_rate": 0.0001347883167715258, | |
| "loss": 0.04288272559642792, | |
| "mean_token_accuracy": 0.9790759727358818, | |
| "num_tokens": 9587687.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.045213503576815126, | |
| "epoch": 1.2666666666666666, | |
| "grad_norm": 0.1179802417755127, | |
| "learning_rate": 0.00013340666180976712, | |
| "loss": 0.04305934309959412, | |
| "mean_token_accuracy": 0.9792578309774399, | |
| "num_tokens": 9689568.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.04414475904777646, | |
| "epoch": 1.28, | |
| "grad_norm": 0.10094133019447327, | |
| "learning_rate": 0.0001320177905782236, | |
| "loss": 0.04242780804634094, | |
| "mean_token_accuracy": 0.9795284524559975, | |
| "num_tokens": 9791805.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.04556956263259053, | |
| "epoch": 1.2933333333333334, | |
| "grad_norm": 0.07614333927631378, | |
| "learning_rate": 0.0001306220030909931, | |
| "loss": 0.043446135520935056, | |
| "mean_token_accuracy": 0.9790474250912666, | |
| "num_tokens": 9893871.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.04372665649279952, | |
| "epoch": 1.3066666666666666, | |
| "grad_norm": 0.09622333198785782, | |
| "learning_rate": 0.00012921960085617373, | |
| "loss": 0.04184481799602509, | |
| "mean_token_accuracy": 0.979928120970726, | |
| "num_tokens": 9995743.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.04449463188648224, | |
| "epoch": 1.32, | |
| "grad_norm": 0.08018497377634048, | |
| "learning_rate": 0.0001278108868107346, | |
| "loss": 0.043444639444351195, | |
| "mean_token_accuracy": 0.979103796184063, | |
| "num_tokens": 10097341.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.04594048615545034, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.08098988234996796, | |
| "learning_rate": 0.00012639616525507717, | |
| "loss": 0.04326811134815216, | |
| "mean_token_accuracy": 0.9793805435299874, | |
| "num_tokens": 10199817.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.044195070117712024, | |
| "epoch": 1.3466666666666667, | |
| "grad_norm": 0.07928124070167542, | |
| "learning_rate": 0.00012497574178730266, | |
| "loss": 0.04292008876800537, | |
| "mean_token_accuracy": 0.979155270755291, | |
| "num_tokens": 10301704.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 0.04565720958635211, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.07645630836486816, | |
| "learning_rate": 0.00012354992323719877, | |
| "loss": 0.04377688765525818, | |
| "mean_token_accuracy": 0.9790802374482155, | |
| "num_tokens": 10404032.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 0.044813665375113484, | |
| "epoch": 1.3733333333333333, | |
| "grad_norm": 0.0589720793068409, | |
| "learning_rate": 0.0001221190175999606, | |
| "loss": 0.04262206256389618, | |
| "mean_token_accuracy": 0.9795415893197059, | |
| "num_tokens": 10505610.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 0.04555217456072569, | |
| "epoch": 1.3866666666666667, | |
| "grad_norm": 0.11566988378763199, | |
| "learning_rate": 0.00012068333396965968, | |
| "loss": 0.04380977749824524, | |
| "mean_token_accuracy": 0.9788099125027656, | |
| "num_tokens": 10606782.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 0.04532764628529549, | |
| "epoch": 1.4, | |
| "grad_norm": 0.086255744099617, | |
| "learning_rate": 0.00011924318247247568, | |
| "loss": 0.04329647421836853, | |
| "mean_token_accuracy": 0.9791126802563668, | |
| "num_tokens": 10708263.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 0.04514106567949057, | |
| "epoch": 1.4133333333333333, | |
| "grad_norm": 0.06086282059550285, | |
| "learning_rate": 0.00011779887419970512, | |
| "loss": 0.04245937764644623, | |
| "mean_token_accuracy": 0.9797914355993271, | |
| "num_tokens": 10810300.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 0.04454901767894626, | |
| "epoch": 1.4266666666666667, | |
| "grad_norm": 0.07433643192052841, | |
| "learning_rate": 0.00011635072114056162, | |
| "loss": 0.043132221698760985, | |
| "mean_token_accuracy": 0.9791502475738525, | |
| "num_tokens": 10912165.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 0.04529751744121313, | |
| "epoch": 1.44, | |
| "grad_norm": 0.13444772362709045, | |
| "learning_rate": 0.00011489903611478229, | |
| "loss": 0.043829315900802614, | |
| "mean_token_accuracy": 0.9784928604960441, | |
| "num_tokens": 11014107.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 0.045276003703474996, | |
| "epoch": 1.4533333333333334, | |
| "grad_norm": 0.06211255118250847, | |
| "learning_rate": 0.00011344413270505457, | |
| "loss": 0.04307844340801239, | |
| "mean_token_accuracy": 0.9793669879436493, | |
| "num_tokens": 11116149.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 0.04517210628837347, | |
| "epoch": 1.4666666666666668, | |
| "grad_norm": 0.07761016488075256, | |
| "learning_rate": 0.00011198632518927832, | |
| "loss": 0.04319383502006531, | |
| "mean_token_accuracy": 0.9791072577238082, | |
| "num_tokens": 11217550.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.043730517756193875, | |
| "epoch": 1.48, | |
| "grad_norm": 0.08502429723739624, | |
| "learning_rate": 0.00011052592847267781, | |
| "loss": 0.0423270434141159, | |
| "mean_token_accuracy": 0.9796715095639229, | |
| "num_tokens": 11319372.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 0.04452117690816522, | |
| "epoch": 1.4933333333333334, | |
| "grad_norm": 0.06671646982431412, | |
| "learning_rate": 0.00010906325801977804, | |
| "loss": 0.04296606779098511, | |
| "mean_token_accuracy": 0.9795390352606773, | |
| "num_tokens": 11421402.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 0.04468898214399815, | |
| "epoch": 1.5066666666666668, | |
| "grad_norm": 0.08121279627084732, | |
| "learning_rate": 0.00010759862978626031, | |
| "loss": 0.04153239727020264, | |
| "mean_token_accuracy": 0.9799500927329063, | |
| "num_tokens": 11523747.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 0.04545955043286085, | |
| "epoch": 1.52, | |
| "grad_norm": 0.05693936347961426, | |
| "learning_rate": 0.00010613236015071195, | |
| "loss": 0.04396485388278961, | |
| "mean_token_accuracy": 0.9788213685154915, | |
| "num_tokens": 11625877.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 0.046351166628301146, | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 0.09166613221168518, | |
| "learning_rate": 0.00010466476584628413, | |
| "loss": 0.043498843908309937, | |
| "mean_token_accuracy": 0.9791526988148689, | |
| "num_tokens": 11727555.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.045797071792185305, | |
| "epoch": 1.5466666666666666, | |
| "grad_norm": 0.0821656882762909, | |
| "learning_rate": 0.00010319616389227369, | |
| "loss": 0.043224507570266725, | |
| "mean_token_accuracy": 0.9792197465896606, | |
| "num_tokens": 11829191.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 0.0452940653078258, | |
| "epoch": 1.56, | |
| "grad_norm": 0.07786799967288971, | |
| "learning_rate": 0.00010172687152564273, | |
| "loss": 0.04384516477584839, | |
| "mean_token_accuracy": 0.9784497052431107, | |
| "num_tokens": 11931301.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 0.04483237583190203, | |
| "epoch": 1.5733333333333333, | |
| "grad_norm": 0.08482241630554199, | |
| "learning_rate": 0.00010025720613249136, | |
| "loss": 0.04273432493209839, | |
| "mean_token_accuracy": 0.9794994488358497, | |
| "num_tokens": 12033500.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 0.045613698475062844, | |
| "epoch": 1.5866666666666667, | |
| "grad_norm": 0.0863715335726738, | |
| "learning_rate": 9.878748517949829e-05, | |
| "loss": 0.04371984004974365, | |
| "mean_token_accuracy": 0.9791261553764343, | |
| "num_tokens": 12135440.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 0.04589881300926209, | |
| "epoch": 1.6, | |
| "grad_norm": 0.062190357595682144, | |
| "learning_rate": 9.731802614534383e-05, | |
| "loss": 0.04390855133533478, | |
| "mean_token_accuracy": 0.9788092419505119, | |
| "num_tokens": 12237789.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.04429604625329375, | |
| "epoch": 1.6133333333333333, | |
| "grad_norm": 0.06404758989810944, | |
| "learning_rate": 9.584914645213045e-05, | |
| "loss": 0.042604264616966245, | |
| "mean_token_accuracy": 0.9796271160244941, | |
| "num_tokens": 12339966.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 0.04499910678714514, | |
| "epoch": 1.6266666666666667, | |
| "grad_norm": 0.06570903211832047, | |
| "learning_rate": 9.438116339681545e-05, | |
| "loss": 0.04222431182861328, | |
| "mean_token_accuracy": 0.9794401109218598, | |
| "num_tokens": 12441867.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 0.04458219092339277, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 0.06039030849933624, | |
| "learning_rate": 9.291439408267093e-05, | |
| "loss": 0.04276288151741028, | |
| "mean_token_accuracy": 0.9794755399227142, | |
| "num_tokens": 12544334.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 0.04523820038884878, | |
| "epoch": 1.6533333333333333, | |
| "grad_norm": 0.09730029851198196, | |
| "learning_rate": 9.144915535078509e-05, | |
| "loss": 0.043028077483177184, | |
| "mean_token_accuracy": 0.9791945442557335, | |
| "num_tokens": 12646733.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 0.04477119510993362, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.0753539651632309, | |
| "learning_rate": 8.998576371162073e-05, | |
| "loss": 0.04317043125629425, | |
| "mean_token_accuracy": 0.9792640700936317, | |
| "num_tokens": 12748659.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.044788467884063723, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.07562968134880066, | |
| "learning_rate": 8.852453527664466e-05, | |
| "loss": 0.04256285130977631, | |
| "mean_token_accuracy": 0.979301193356514, | |
| "num_tokens": 12850375.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 0.045563530456274745, | |
| "epoch": 1.6933333333333334, | |
| "grad_norm": 0.08481646329164505, | |
| "learning_rate": 8.706578569004392e-05, | |
| "loss": 0.043007442355155946, | |
| "mean_token_accuracy": 0.9794534996151925, | |
| "num_tokens": 12952926.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 0.04439763380214572, | |
| "epoch": 1.7066666666666666, | |
| "grad_norm": 0.07377834618091583, | |
| "learning_rate": 8.560983006054208e-05, | |
| "loss": 0.04233894348144531, | |
| "mean_token_accuracy": 0.9793659463524819, | |
| "num_tokens": 13055094.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 0.04448066912591457, | |
| "epoch": 1.72, | |
| "grad_norm": 0.06845632195472717, | |
| "learning_rate": 8.415698289333213e-05, | |
| "loss": 0.04230453968048096, | |
| "mean_token_accuracy": 0.9793373107910156, | |
| "num_tokens": 13157565.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 0.04516846965998411, | |
| "epoch": 1.7333333333333334, | |
| "grad_norm": 0.0826217532157898, | |
| "learning_rate": 8.270755802213896e-05, | |
| "loss": 0.043338698148727414, | |
| "mean_token_accuracy": 0.9791581705212593, | |
| "num_tokens": 13259373.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.045483655855059625, | |
| "epoch": 1.7466666666666666, | |
| "grad_norm": 0.09278784692287445, | |
| "learning_rate": 8.126186854142752e-05, | |
| "loss": 0.043374094367027285, | |
| "mean_token_accuracy": 0.9789844870567321, | |
| "num_tokens": 13361653.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 0.044713820703327654, | |
| "epoch": 1.76, | |
| "grad_norm": 0.06657784432172775, | |
| "learning_rate": 7.982022673877022e-05, | |
| "loss": 0.04237607717514038, | |
| "mean_token_accuracy": 0.9793095976114273, | |
| "num_tokens": 13463283.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 0.044877147488296035, | |
| "epoch": 1.7733333333333334, | |
| "grad_norm": 0.08266546577215195, | |
| "learning_rate": 7.838294402738875e-05, | |
| "loss": 0.04311709105968475, | |
| "mean_token_accuracy": 0.9791682615876198, | |
| "num_tokens": 13565428.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 0.04468537019565701, | |
| "epoch": 1.7866666666666666, | |
| "grad_norm": 0.07597433030605316, | |
| "learning_rate": 7.695033087888489e-05, | |
| "loss": 0.0424690306186676, | |
| "mean_token_accuracy": 0.9796170979738236, | |
| "num_tokens": 13667448.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 0.04455111119896173, | |
| "epoch": 1.8, | |
| "grad_norm": 0.06538581848144531, | |
| "learning_rate": 7.55226967561746e-05, | |
| "loss": 0.04193790853023529, | |
| "mean_token_accuracy": 0.9794035986065864, | |
| "num_tokens": 13769362.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.043454491440206765, | |
| "epoch": 1.8133333333333335, | |
| "grad_norm": 0.05730016157031059, | |
| "learning_rate": 7.410035004664011e-05, | |
| "loss": 0.04141553640365601, | |
| "mean_token_accuracy": 0.9800622522830963, | |
| "num_tokens": 13871782.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 0.044676115922629836, | |
| "epoch": 1.8266666666666667, | |
| "grad_norm": 0.04646085202693939, | |
| "learning_rate": 7.268359799551416e-05, | |
| "loss": 0.04284192621707916, | |
| "mean_token_accuracy": 0.9793128624558449, | |
| "num_tokens": 13973630.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 0.04494037302210927, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 0.09230729192495346, | |
| "learning_rate": 7.12727466395112e-05, | |
| "loss": 0.043046200275421144, | |
| "mean_token_accuracy": 0.9793307974934577, | |
| "num_tokens": 14075906.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 0.045368336327373984, | |
| "epoch": 1.8533333333333335, | |
| "grad_norm": 0.04331463947892189, | |
| "learning_rate": 6.986810074071932e-05, | |
| "loss": 0.042864075303077696, | |
| "mean_token_accuracy": 0.978898110985756, | |
| "num_tokens": 14177856.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 0.04510376630350947, | |
| "epoch": 1.8666666666666667, | |
| "grad_norm": 0.09033851325511932, | |
| "learning_rate": 6.846996372076786e-05, | |
| "loss": 0.04259768426418305, | |
| "mean_token_accuracy": 0.9792723521590233, | |
| "num_tokens": 14280019.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.04520597280934453, | |
| "epoch": 1.88, | |
| "grad_norm": 0.04347246140241623, | |
| "learning_rate": 6.707863759528446e-05, | |
| "loss": 0.043121880292892455, | |
| "mean_token_accuracy": 0.9790245160460472, | |
| "num_tokens": 14382127.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 0.045137868728488684, | |
| "epoch": 1.8933333333333333, | |
| "grad_norm": 0.08444561064243317, | |
| "learning_rate": 6.569442290865564e-05, | |
| "loss": 0.042786693572998045, | |
| "mean_token_accuracy": 0.9794920086860657, | |
| "num_tokens": 14484156.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 0.0450214795768261, | |
| "epoch": 1.9066666666666667, | |
| "grad_norm": 0.06270349770784378, | |
| "learning_rate": 6.431761866910549e-05, | |
| "loss": 0.04266757369041443, | |
| "mean_token_accuracy": 0.9790657863020897, | |
| "num_tokens": 14586261.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 0.04571379153057933, | |
| "epoch": 1.92, | |
| "grad_norm": 0.059830646961927414, | |
| "learning_rate": 6.294852228410585e-05, | |
| "loss": 0.043165019154548644, | |
| "mean_token_accuracy": 0.9789528846740723, | |
| "num_tokens": 14688252.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 0.04564494509249926, | |
| "epoch": 1.9333333333333333, | |
| "grad_norm": 0.2881755828857422, | |
| "learning_rate": 6.158742949613263e-05, | |
| "loss": 0.042789730429649356, | |
| "mean_token_accuracy": 0.9789565414190292, | |
| "num_tokens": 14790706.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.04481498738750815, | |
| "epoch": 1.9466666666666668, | |
| "grad_norm": 0.0739307701587677, | |
| "learning_rate": 6.023463431878159e-05, | |
| "loss": 0.04184747338294983, | |
| "mean_token_accuracy": 0.9795544907450676, | |
| "num_tokens": 14892667.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 0.045400716736912726, | |
| "epoch": 1.96, | |
| "grad_norm": 0.0694345086812973, | |
| "learning_rate": 5.889042897325755e-05, | |
| "loss": 0.04274559020996094, | |
| "mean_token_accuracy": 0.9791734784841537, | |
| "num_tokens": 14994588.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 0.045871376898139714, | |
| "epoch": 1.9733333333333334, | |
| "grad_norm": 0.06866899877786636, | |
| "learning_rate": 5.7555103825250914e-05, | |
| "loss": 0.043129801750183105, | |
| "mean_token_accuracy": 0.979410058259964, | |
| "num_tokens": 15096814.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 0.04594316426664591, | |
| "epoch": 1.9866666666666668, | |
| "grad_norm": 0.07196313887834549, | |
| "learning_rate": 5.622894732221482e-05, | |
| "loss": 0.04333162605762482, | |
| "mean_token_accuracy": 0.9789909616112709, | |
| "num_tokens": 15198781.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 0.046280243806540965, | |
| "epoch": 2.0, | |
| "grad_norm": 0.07306694984436035, | |
| "learning_rate": 5.491224593105695e-05, | |
| "loss": 0.04286535978317261, | |
| "mean_token_accuracy": 0.9792644336819649, | |
| "num_tokens": 15300370.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 0.044749976880848405, | |
| "epoch": 2.013333333333333, | |
| "grad_norm": 0.06247550994157791, | |
| "learning_rate": 5.360528407625873e-05, | |
| "loss": 0.04155576527118683, | |
| "mean_token_accuracy": 0.979676017165184, | |
| "num_tokens": 15402333.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 0.045135741028934716, | |
| "epoch": 2.026666666666667, | |
| "grad_norm": 0.09815753251314163, | |
| "learning_rate": 5.2308344078436344e-05, | |
| "loss": 0.042350149154663085, | |
| "mean_token_accuracy": 0.979559974372387, | |
| "num_tokens": 15504158.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 0.045068098604679106, | |
| "epoch": 2.04, | |
| "grad_norm": 0.09551538527011871, | |
| "learning_rate": 5.1021706093355414e-05, | |
| "loss": 0.04268674254417419, | |
| "mean_token_accuracy": 0.9792046830058098, | |
| "num_tokens": 15605979.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 0.0467217774130404, | |
| "epoch": 2.0533333333333332, | |
| "grad_norm": 0.0750860869884491, | |
| "learning_rate": 4.974564805141405e-05, | |
| "loss": 0.04325474202632904, | |
| "mean_token_accuracy": 0.9788183540105819, | |
| "num_tokens": 15708226.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.045709628332406285, | |
| "epoch": 2.066666666666667, | |
| "grad_norm": 0.08207862824201584, | |
| "learning_rate": 4.848044559760624e-05, | |
| "loss": 0.043493375182151794, | |
| "mean_token_accuracy": 0.9793010488152504, | |
| "num_tokens": 15810035.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.04442885173484683, | |
| "epoch": 2.08, | |
| "grad_norm": 0.06018839031457901, | |
| "learning_rate": 4.7226372031978735e-05, | |
| "loss": 0.0418207585811615, | |
| "mean_token_accuracy": 0.9797791764140129, | |
| "num_tokens": 15912192.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 0.046121115796267986, | |
| "epoch": 2.0933333333333333, | |
| "grad_norm": 0.06739337742328644, | |
| "learning_rate": 4.598369825059522e-05, | |
| "loss": 0.04348099529743195, | |
| "mean_token_accuracy": 0.9789452716708184, | |
| "num_tokens": 16013752.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 0.04560723854228854, | |
| "epoch": 2.1066666666666665, | |
| "grad_norm": 0.05784814432263374, | |
| "learning_rate": 4.475269268701868e-05, | |
| "loss": 0.04268187880516052, | |
| "mean_token_accuracy": 0.9791408717632294, | |
| "num_tokens": 16115637.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 0.045645091123878954, | |
| "epoch": 2.12, | |
| "grad_norm": 0.05607442185282707, | |
| "learning_rate": 4.353362125432674e-05, | |
| "loss": 0.042373275756835936, | |
| "mean_token_accuracy": 0.979694114625454, | |
| "num_tokens": 16217990.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 0.04457983383908868, | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 0.09050878137350082, | |
| "learning_rate": 4.232674728767082e-05, | |
| "loss": 0.042291298508644104, | |
| "mean_token_accuracy": 0.9795105144381523, | |
| "num_tokens": 16319781.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 0.04519128203392029, | |
| "epoch": 2.1466666666666665, | |
| "grad_norm": 0.06114558130502701, | |
| "learning_rate": 4.113233148739224e-05, | |
| "loss": 0.04246037602424622, | |
| "mean_token_accuracy": 0.9795787811279297, | |
| "num_tokens": 16422036.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 0.045624539349228145, | |
| "epoch": 2.16, | |
| "grad_norm": 0.06515778601169586, | |
| "learning_rate": 3.9950631862707964e-05, | |
| "loss": 0.04316512644290924, | |
| "mean_token_accuracy": 0.9788484647870064, | |
| "num_tokens": 16524417.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 0.04569779820740223, | |
| "epoch": 2.1733333333333333, | |
| "grad_norm": 0.08130136877298355, | |
| "learning_rate": 3.8781903675976775e-05, | |
| "loss": 0.04316212832927704, | |
| "mean_token_accuracy": 0.9789097234606743, | |
| "num_tokens": 16626474.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 0.04466199018061161, | |
| "epoch": 2.1866666666666665, | |
| "grad_norm": 0.06522400677204132, | |
| "learning_rate": 3.762639938755974e-05, | |
| "loss": 0.04167875051498413, | |
| "mean_token_accuracy": 0.979556742310524, | |
| "num_tokens": 16728484.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 0.044957845285534856, | |
| "epoch": 2.2, | |
| "grad_norm": 0.07835223525762558, | |
| "learning_rate": 3.648436860128525e-05, | |
| "loss": 0.041939809918403625, | |
| "mean_token_accuracy": 0.9797166779637336, | |
| "num_tokens": 16830621.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 0.04469237914308906, | |
| "epoch": 2.2133333333333334, | |
| "grad_norm": 0.07076659053564072, | |
| "learning_rate": 3.535605801053147e-05, | |
| "loss": 0.04294973611831665, | |
| "mean_token_accuracy": 0.9787584990262985, | |
| "num_tokens": 16932449.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 0.044177047722041604, | |
| "epoch": 2.2266666666666666, | |
| "grad_norm": 0.0865534245967865, | |
| "learning_rate": 3.424171134493756e-05, | |
| "loss": 0.041136741638183594, | |
| "mean_token_accuracy": 0.9797752141952515, | |
| "num_tokens": 17034746.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 0.044158230628818275, | |
| "epoch": 2.24, | |
| "grad_norm": 0.09348734468221664, | |
| "learning_rate": 3.314156931775449e-05, | |
| "loss": 0.04184678792953491, | |
| "mean_token_accuracy": 0.979484710097313, | |
| "num_tokens": 17137032.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 0.04505048170685768, | |
| "epoch": 2.2533333333333334, | |
| "grad_norm": 0.04819338768720627, | |
| "learning_rate": 3.205586957384838e-05, | |
| "loss": 0.04278863370418549, | |
| "mean_token_accuracy": 0.9789488822221756, | |
| "num_tokens": 17238981.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 0.044143668562173846, | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 0.08243514597415924, | |
| "learning_rate": 3.09848466383657e-05, | |
| "loss": 0.04165869653224945, | |
| "mean_token_accuracy": 0.9797174796462059, | |
| "num_tokens": 17341204.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.04463189765810967, | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 0.06700066477060318, | |
| "learning_rate": 2.9928731866073135e-05, | |
| "loss": 0.041824132204055786, | |
| "mean_token_accuracy": 0.9796530723571777, | |
| "num_tokens": 17443109.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 0.044507946353405714, | |
| "epoch": 2.2933333333333334, | |
| "grad_norm": 0.059370577335357666, | |
| "learning_rate": 2.8887753391381924e-05, | |
| "loss": 0.04232283234596253, | |
| "mean_token_accuracy": 0.9795172438025475, | |
| "num_tokens": 17544670.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 0.04427545545622706, | |
| "epoch": 2.3066666666666666, | |
| "grad_norm": 0.08195611089468002, | |
| "learning_rate": 2.7862136079067646e-05, | |
| "loss": 0.042314866185188295, | |
| "mean_token_accuracy": 0.9798214435577393, | |
| "num_tokens": 17647059.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 0.04503831313923001, | |
| "epoch": 2.32, | |
| "grad_norm": 0.06154360994696617, | |
| "learning_rate": 2.6852101475696843e-05, | |
| "loss": 0.04239094257354736, | |
| "mean_token_accuracy": 0.979605621099472, | |
| "num_tokens": 17749005.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 0.04526049355044961, | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.07333716750144958, | |
| "learning_rate": 2.585786776176985e-05, | |
| "loss": 0.04255903661251068, | |
| "mean_token_accuracy": 0.9788812786340714, | |
| "num_tokens": 17851383.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 0.04530645264312625, | |
| "epoch": 2.3466666666666667, | |
| "grad_norm": 0.06190125271677971, | |
| "learning_rate": 2.487964970459118e-05, | |
| "loss": 0.042575931549072264, | |
| "mean_token_accuracy": 0.9791432306170463, | |
| "num_tokens": 17953577.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 0.04435355756431818, | |
| "epoch": 2.36, | |
| "grad_norm": 0.08465747535228729, | |
| "learning_rate": 2.3917658611876904e-05, | |
| "loss": 0.04138871431350708, | |
| "mean_token_accuracy": 0.9799614399671555, | |
| "num_tokens": 18055293.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 0.04456534581258893, | |
| "epoch": 2.3733333333333335, | |
| "grad_norm": 0.0772717297077179, | |
| "learning_rate": 2.297210228610952e-05, | |
| "loss": 0.04198825061321258, | |
| "mean_token_accuracy": 0.9794510439038276, | |
| "num_tokens": 18157289.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 0.04461102448403835, | |
| "epoch": 2.3866666666666667, | |
| "grad_norm": 0.08000056445598602, | |
| "learning_rate": 2.2043184979649933e-05, | |
| "loss": 0.041901758313179015, | |
| "mean_token_accuracy": 0.9796808436512947, | |
| "num_tokens": 18258778.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 0.04491544393822551, | |
| "epoch": 2.4, | |
| "grad_norm": 0.0720711350440979, | |
| "learning_rate": 2.1131107350616187e-05, | |
| "loss": 0.042588868737220766, | |
| "mean_token_accuracy": 0.9793313190340995, | |
| "num_tokens": 18360839.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 0.045493978820741174, | |
| "epoch": 2.413333333333333, | |
| "grad_norm": 0.09875239431858063, | |
| "learning_rate": 2.0236066419538934e-05, | |
| "loss": 0.04313438236713409, | |
| "mean_token_accuracy": 0.9793697372078896, | |
| "num_tokens": 18462252.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 0.04539180537685752, | |
| "epoch": 2.4266666666666667, | |
| "grad_norm": 0.04752529039978981, | |
| "learning_rate": 1.9358255526802303e-05, | |
| "loss": 0.041815349459648134, | |
| "mean_token_accuracy": 0.9794102787971497, | |
| "num_tokens": 18564453.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 0.044612882751971485, | |
| "epoch": 2.44, | |
| "grad_norm": 0.05158265680074692, | |
| "learning_rate": 1.8497864290879953e-05, | |
| "loss": 0.04235563278198242, | |
| "mean_token_accuracy": 0.9792704641819, | |
| "num_tokens": 18666497.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 0.045019051525741816, | |
| "epoch": 2.453333333333333, | |
| "grad_norm": 0.0648743286728859, | |
| "learning_rate": 1.7655078567375028e-05, | |
| "loss": 0.04204939901828766, | |
| "mean_token_accuracy": 0.9794104173779488, | |
| "num_tokens": 18768455.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 0.04469795366749167, | |
| "epoch": 2.466666666666667, | |
| "grad_norm": 0.05884250998497009, | |
| "learning_rate": 1.683008040887285e-05, | |
| "loss": 0.04209013283252716, | |
| "mean_token_accuracy": 0.9796774923801422, | |
| "num_tokens": 18870275.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 0.04474199656397104, | |
| "epoch": 2.48, | |
| "grad_norm": 0.051543645560741425, | |
| "learning_rate": 1.6023048025615405e-05, | |
| "loss": 0.04179444909095764, | |
| "mean_token_accuracy": 0.9795808404684067, | |
| "num_tokens": 18972156.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 0.04483764311298728, | |
| "epoch": 2.493333333333333, | |
| "grad_norm": 0.10630819946527481, | |
| "learning_rate": 1.5234155747005486e-05, | |
| "loss": 0.042180657386779785, | |
| "mean_token_accuracy": 0.9794986173510551, | |
| "num_tokens": 19074197.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 0.04558736402541399, | |
| "epoch": 2.506666666666667, | |
| "grad_norm": 0.08093755692243576, | |
| "learning_rate": 1.4463573983949341e-05, | |
| "loss": 0.04298904240131378, | |
| "mean_token_accuracy": 0.9790481492877007, | |
| "num_tokens": 19176367.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 0.04453156525269151, | |
| "epoch": 2.52, | |
| "grad_norm": 0.0727071687579155, | |
| "learning_rate": 1.3711469192045723e-05, | |
| "loss": 0.041091355681419375, | |
| "mean_token_accuracy": 0.9804318726062775, | |
| "num_tokens": 19278992.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 0.04554087147116661, | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 0.0910055935382843, | |
| "learning_rate": 1.297800383562926e-05, | |
| "loss": 0.04345537126064301, | |
| "mean_token_accuracy": 0.9786257922649384, | |
| "num_tokens": 19380593.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 0.04596257032826543, | |
| "epoch": 2.546666666666667, | |
| "grad_norm": 0.0877053365111351, | |
| "learning_rate": 1.2263336352676235e-05, | |
| "loss": 0.04255788326263428, | |
| "mean_token_accuracy": 0.9795473828911782, | |
| "num_tokens": 19482278.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 0.044655687548220156, | |
| "epoch": 2.56, | |
| "grad_norm": 0.10276857763528824, | |
| "learning_rate": 1.1567621120579753e-05, | |
| "loss": 0.0418385773897171, | |
| "mean_token_accuracy": 0.9795376226305962, | |
| "num_tokens": 19584297.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 0.04575161607936025, | |
| "epoch": 2.5733333333333333, | |
| "grad_norm": 0.09059888869524002, | |
| "learning_rate": 1.089100842280234e-05, | |
| "loss": 0.042618009448051455, | |
| "mean_token_accuracy": 0.9796013042330742, | |
| "num_tokens": 19686257.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 0.04560979856178164, | |
| "epoch": 2.586666666666667, | |
| "grad_norm": 0.048925597220659256, | |
| "learning_rate": 1.0233644416412791e-05, | |
| "loss": 0.04292104840278625, | |
| "mean_token_accuracy": 0.9794995337724686, | |
| "num_tokens": 19788450.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 0.0455952113494277, | |
| "epoch": 2.6, | |
| "grad_norm": 0.048526402562856674, | |
| "learning_rate": 9.595671100514214e-06, | |
| "loss": 0.042637795209884644, | |
| "mean_token_accuracy": 0.9797911092638969, | |
| "num_tokens": 19890524.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 0.04548884928226471, | |
| "epoch": 2.6133333333333333, | |
| "grad_norm": 0.06042620167136192, | |
| "learning_rate": 8.977226285570606e-06, | |
| "loss": 0.04222815930843353, | |
| "mean_token_accuracy": 0.9794741749763489, | |
| "num_tokens": 19992209.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 0.045671455282717946, | |
| "epoch": 2.626666666666667, | |
| "grad_norm": 0.07702252268791199, | |
| "learning_rate": 8.378443563637828e-06, | |
| "loss": 0.042873308062553406, | |
| "mean_token_accuracy": 0.9794026196002961, | |
| "num_tokens": 20093703.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 0.04522231016308069, | |
| "epoch": 2.64, | |
| "grad_norm": 0.07133087515830994, | |
| "learning_rate": 7.799452279506125e-06, | |
| "loss": 0.042153152823448184, | |
| "mean_token_accuracy": 0.9797803938388825, | |
| "num_tokens": 20195947.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 0.04628952695056796, | |
| "epoch": 2.6533333333333333, | |
| "grad_norm": 0.06586236506700516, | |
| "learning_rate": 7.240377502759932e-06, | |
| "loss": 0.043617674708366395, | |
| "mean_token_accuracy": 0.9784920737147331, | |
| "num_tokens": 20298043.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 0.045405203476548195, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.06839724630117416, | |
| "learning_rate": 6.70134000076118e-06, | |
| "loss": 0.04227378368377686, | |
| "mean_token_accuracy": 0.979735977947712, | |
| "num_tokens": 20399972.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 0.045020535588264465, | |
| "epoch": 2.68, | |
| "grad_norm": 0.07815848290920258, | |
| "learning_rate": 6.182456212562093e-06, | |
| "loss": 0.04192916452884674, | |
| "mean_token_accuracy": 0.9796771243214607, | |
| "num_tokens": 20501675.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 0.04609425235539675, | |
| "epoch": 2.6933333333333334, | |
| "grad_norm": 0.05290106683969498, | |
| "learning_rate": 5.68383822375278e-06, | |
| "loss": 0.042898637056350705, | |
| "mean_token_accuracy": 0.9792009994387627, | |
| "num_tokens": 20603651.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 0.0457917626015842, | |
| "epoch": 2.7066666666666666, | |
| "grad_norm": 0.0704483613371849, | |
| "learning_rate": 5.205593742249326e-06, | |
| "loss": 0.0423770546913147, | |
| "mean_token_accuracy": 0.9790433034300804, | |
| "num_tokens": 20705702.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 0.044912660401314496, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.058434613049030304, | |
| "learning_rate": 4.747826075027506e-06, | |
| "loss": 0.04174522757530212, | |
| "mean_token_accuracy": 0.9795982718467713, | |
| "num_tokens": 20807336.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 0.045613402500748634, | |
| "epoch": 2.7333333333333334, | |
| "grad_norm": 0.08788046985864639, | |
| "learning_rate": 4.310634105807065e-06, | |
| "loss": 0.04344511330127716, | |
| "mean_token_accuracy": 0.9793641656637192, | |
| "num_tokens": 20909744.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 0.04498438341543078, | |
| "epoch": 2.7466666666666666, | |
| "grad_norm": 0.06054578721523285, | |
| "learning_rate": 3.894112273691697e-06, | |
| "loss": 0.041690278053283694, | |
| "mean_token_accuracy": 0.9799363717436791, | |
| "num_tokens": 21011520.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.04519799826666713, | |
| "epoch": 2.76, | |
| "grad_norm": 0.06741084903478622, | |
| "learning_rate": 3.4983505527688586e-06, | |
| "loss": 0.042607730627059935, | |
| "mean_token_accuracy": 0.979535199701786, | |
| "num_tokens": 21113638.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 0.04527061656117439, | |
| "epoch": 2.7733333333333334, | |
| "grad_norm": 0.053430285304784775, | |
| "learning_rate": 3.1234344326742657e-06, | |
| "loss": 0.04179522097110748, | |
| "mean_token_accuracy": 0.979697409272194, | |
| "num_tokens": 21215783.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 0.045730549935251476, | |
| "epoch": 2.7866666666666666, | |
| "grad_norm": 0.07262956351041794, | |
| "learning_rate": 2.7694449001250512e-06, | |
| "loss": 0.042841532826423646, | |
| "mean_token_accuracy": 0.9794132426381111, | |
| "num_tokens": 21317798.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 0.04552676072344184, | |
| "epoch": 2.8, | |
| "grad_norm": 0.06751976907253265, | |
| "learning_rate": 2.4364584214254695e-06, | |
| "loss": 0.04251702129840851, | |
| "mean_token_accuracy": 0.9793218955397606, | |
| "num_tokens": 21419787.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 0.045480293966829774, | |
| "epoch": 2.8133333333333335, | |
| "grad_norm": 0.0856935977935791, | |
| "learning_rate": 2.124546925949389e-06, | |
| "loss": 0.04228883981704712, | |
| "mean_token_accuracy": 0.9794924795627594, | |
| "num_tokens": 21521816.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 0.04522721925750375, | |
| "epoch": 2.8266666666666667, | |
| "grad_norm": 0.04721014201641083, | |
| "learning_rate": 1.8337777906023978e-06, | |
| "loss": 0.04205127358436585, | |
| "mean_token_accuracy": 0.9795928984880448, | |
| "num_tokens": 21623696.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 0.0451619129627943, | |
| "epoch": 2.84, | |
| "grad_norm": 0.06828150898218155, | |
| "learning_rate": 1.5642138252677019e-06, | |
| "loss": 0.041848546266555785, | |
| "mean_token_accuracy": 0.9796140640974045, | |
| "num_tokens": 21726066.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 0.04501318633556366, | |
| "epoch": 2.8533333333333335, | |
| "grad_norm": 0.08222071826457977, | |
| "learning_rate": 1.3159132592382772e-06, | |
| "loss": 0.04213366806507111, | |
| "mean_token_accuracy": 0.9795982599258423, | |
| "num_tokens": 21828178.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 0.0461537716910243, | |
| "epoch": 2.8666666666666667, | |
| "grad_norm": 0.0802520290017128, | |
| "learning_rate": 1.0889297286386102e-06, | |
| "loss": 0.04323468208312988, | |
| "mean_token_accuracy": 0.9791506737470627, | |
| "num_tokens": 21929963.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 0.04528212863951921, | |
| "epoch": 2.88, | |
| "grad_norm": 0.08974730968475342, | |
| "learning_rate": 8.833122648386871e-07, | |
| "loss": 0.042816996574401855, | |
| "mean_token_accuracy": 0.9789806365966797, | |
| "num_tokens": 22032092.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 0.045245842542499304, | |
| "epoch": 2.8933333333333335, | |
| "grad_norm": 0.05283057317137718, | |
| "learning_rate": 6.991052838624113e-07, | |
| "loss": 0.04174770712852478, | |
| "mean_token_accuracy": 0.9798634141683579, | |
| "num_tokens": 22134281.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 0.045284852758049964, | |
| "epoch": 2.9066666666666667, | |
| "grad_norm": 0.0722041130065918, | |
| "learning_rate": 5.363485767933663e-07, | |
| "loss": 0.041790124773979184, | |
| "mean_token_accuracy": 0.979168464243412, | |
| "num_tokens": 22236085.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 0.04504124140366912, | |
| "epoch": 2.92, | |
| "grad_norm": 0.06595401465892792, | |
| "learning_rate": 3.9507730117926967e-07, | |
| "loss": 0.04146735072135925, | |
| "mean_token_accuracy": 0.9801181107759476, | |
| "num_tokens": 22338053.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 0.04522117590531707, | |
| "epoch": 2.9333333333333336, | |
| "grad_norm": 0.06364521384239197, | |
| "learning_rate": 2.7532197343758115e-07, | |
| "loss": 0.04191155731678009, | |
| "mean_token_accuracy": 0.9794103637337684, | |
| "num_tokens": 22440208.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 0.045472448039799926, | |
| "epoch": 2.9466666666666668, | |
| "grad_norm": 0.0597660131752491, | |
| "learning_rate": 1.7710846226355328e-07, | |
| "loss": 0.04289998710155487, | |
| "mean_token_accuracy": 0.9792811706662178, | |
| "num_tokens": 22542219.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 0.04583751475438476, | |
| "epoch": 2.96, | |
| "grad_norm": 0.08572968095541, | |
| "learning_rate": 1.0045798304220145e-07, | |
| "loss": 0.0427745521068573, | |
| "mean_token_accuracy": 0.9792221873998642, | |
| "num_tokens": 22644025.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 0.04562570815905929, | |
| "epoch": 2.9733333333333336, | |
| "grad_norm": 0.0797945037484169, | |
| "learning_rate": 4.5387093265591986e-08, | |
| "loss": 0.04286653101444245, | |
| "mean_token_accuracy": 0.9792360305786133, | |
| "num_tokens": 22745968.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 0.045168190728873014, | |
| "epoch": 2.986666666666667, | |
| "grad_norm": 0.07274357974529266, | |
| "learning_rate": 1.1907688956136477e-08, | |
| "loss": 0.04201154708862305, | |
| "mean_token_accuracy": 0.9799786448478699, | |
| "num_tokens": 22848205.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 0.045816550869494675, | |
| "epoch": 3.0, | |
| "grad_norm": 0.06689723581075668, | |
| "learning_rate": 2.70020969361795e-11, | |
| "loss": 0.042978566884994504, | |
| "mean_token_accuracy": 0.9794494539499283, | |
| "num_tokens": 22950555.0, | |
| "step": 4500 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 4500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5251560037074944e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |