{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.021636501551347463, "epoch": 0.078125, "grad_norm": 20.69505500793457, "learning_rate": 4.99248235291948e-07, "logits/chosen": -2.3179785959016144, "logits/rejected": -2.7809631647550237, "logps/chosen": -53.45781261920929, "logps/rejected": -13.337021085619927, "loss": 0.657944631576538, "mean_token_accuracy": 0.9655282508581877, "num_tokens": 204176.0, "rewards/accuracies": 0.7125, "rewards/chosen": 0.07551017855228678, "rewards/margins": 0.07638100921347686, "rewards/rejected": -0.0008708309359446442, "step": 20 }, { "entropy": 0.020904664618501555, "epoch": 0.15625, "grad_norm": 11.934975624084473, "learning_rate": 4.860089321383249e-07, "logits/chosen": -2.3715720799283035, "logits/rejected": -2.752338156064994, "logps/chosen": -47.57736205458641, "logps/rejected": -9.89167308807373, "loss": 0.5478899478912354, "mean_token_accuracy": 0.9694307189434767, "num_tokens": 408876.0, "rewards/accuracies": 0.94375, "rewards/chosen": 0.3461680102278478, "rewards/margins": 0.35350809887168, "rewards/rejected": -0.007340087751276414, "step": 40 }, { "entropy": 0.020513483681133947, "epoch": 0.234375, "grad_norm": 9.691418647766113, "learning_rate": 4.570784837330908e-07, "logits/chosen": -2.1192375238603347, "logits/rejected": -2.53035075049539, "logps/chosen": -45.57326656579971, "logps/rejected": -8.606099864840507, "loss": 0.4686126232147217, "mean_token_accuracy": 0.9693366013467312, "num_tokens": 612001.0, "rewards/accuracies": 0.984375, "rewards/chosen": 0.603796785371378, "rewards/margins": 0.6143899171147496, "rewards/rejected": -0.01059312963174186, "step": 60 }, { "entropy": 0.021713076254945916, "epoch": 0.3125, "grad_norm": 7.871086120605469, "learning_rate": 4.143803421422659e-07, "logits/chosen": -2.057814393609575, "logits/rejected": -2.4942916088070772, "logps/chosen": -41.88143846988678, "logps/rejected": -14.472851443290711, "loss": 0.43726696968078616, "mean_token_accuracy": 0.9722191035747528, "num_tokens": 816161.0, "rewards/accuracies": 0.971875, "rewards/chosen": 0.7671132636489346, "rewards/margins": 0.7756887916475534, "rewards/rejected": -0.008575526656676402, "step": 80 }, { "entropy": 0.024421673139477205, "epoch": 0.390625, "grad_norm": 8.54527473449707, "learning_rate": 3.607533098450838e-07, "logits/chosen": -1.839447017270682, "logits/rejected": -2.2863276897999363, "logps/chosen": -43.57510041296482, "logps/rejected": -15.660967689752578, "loss": 0.3658822774887085, "mean_token_accuracy": 0.969692699238658, "num_tokens": 1020103.0, "rewards/accuracies": 0.996875, "rewards/chosen": 1.0035955631639808, "rewards/margins": 1.040329695912078, "rewards/rejected": -0.03673412769193263, "step": 100 }, { "entropy": 0.027722828323430804, "epoch": 0.46875, "grad_norm": 12.111254692077637, "learning_rate": 2.9976280085178514e-07, "logits/chosen": -1.7748789854871165, "logits/rejected": -2.128433231822726, "logps/chosen": -39.67851151823997, "logps/rejected": -28.724243608117103, "loss": 0.3575569152832031, "mean_token_accuracy": 0.9722051545977592, "num_tokens": 1224892.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0726068197051064, "rewards/margins": 1.118667737254873, "rewards/rejected": -0.046060921821819534, "step": 120 }, { "entropy": 0.025926758550940576, "epoch": 0.546875, "grad_norm": 6.636215686798096, "learning_rate": 2.3546379277238103e-07, "logits/chosen": -1.7601451944264634, "logits/rejected": -2.249298545209453, "logps/chosen": -41.20892730355263, "logps/rejected": -18.242869859933855, "loss": 0.3083898305892944, "mean_token_accuracy": 0.9701874993741513, "num_tokens": 1428552.0, "rewards/accuracies": 0.996875, "rewards/chosen": 1.2622473721392453, "rewards/margins": 1.3315380293875934, "rewards/rejected": -0.06929065904007245, "step": 140 }, { "entropy": 0.022180175604671604, "epoch": 0.625, "grad_norm": 6.083482265472412, "learning_rate": 1.7213123005989384e-07, "logits/chosen": -1.7182350378556475, "logits/rejected": -2.1864362422005548, "logps/chosen": -38.325451070070265, "logps/rejected": -7.12018935084343, "loss": 0.319840145111084, "mean_token_accuracy": 0.9720365267246962, "num_tokens": 1632891.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.288850279757753, "rewards/margins": 1.3134813922923059, "rewards/rejected": -0.024631122405355654, "step": 160 }, { "entropy": 0.021702260659276364, "epoch": 0.703125, "grad_norm": 5.438543796539307, "learning_rate": 1.139758026728543e-07, "logits/chosen": -1.726502916036394, "logits/rejected": -2.1333350788717915, "logps/chosen": -37.58671213388443, "logps/rejected": -10.681715652346611, "loss": 0.2964779376983643, "mean_token_accuracy": 0.9728810664266347, "num_tokens": 1837723.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3554319092072546, "rewards/margins": 1.3989118587225675, "rewards/rejected": -0.04347995595167049, "step": 180 }, { "entropy": 0.024732174295513686, "epoch": 0.78125, "grad_norm": 11.189217567443848, "learning_rate": 6.486399672274889e-08, "logits/chosen": -1.4975229611778882, "logits/rejected": -1.9543126219106246, "logps/chosen": -39.10779959261417, "logps/rejected": -12.893563643097878, "loss": 0.28920350074768064, "mean_token_accuracy": 0.9711573887616396, "num_tokens": 2041338.0, "rewards/accuracies": 0.996875, "rewards/chosen": 1.4374190052039921, "rewards/margins": 1.4665920787490905, "rewards/rejected": -0.0291730740777723, "step": 200 }, { "entropy": 0.023455908713549432, "epoch": 0.859375, "grad_norm": 6.344812393188477, "learning_rate": 2.806102964753604e-08, "logits/chosen": -1.6520432056236487, "logits/rejected": -2.0055418614354914, "logps/chosen": -37.61689207553864, "logps/rejected": -9.698537448048592, "loss": 0.28933255672454833, "mean_token_accuracy": 0.9727728921920061, "num_tokens": 2244964.0, "rewards/accuracies": 0.996875, "rewards/chosen": 1.408182951901108, "rewards/margins": 1.4448629548773169, "rewards/rejected": -0.036680009391000114, "step": 220 }, { "entropy": 0.028627422009276416, "epoch": 0.9375, "grad_norm": 5.446918964385986, "learning_rate": 6.013760965774106e-09, "logits/chosen": -1.6270668677924687, "logits/rejected": -2.0212430516390407, "logps/chosen": -39.404715886712076, "logps/rejected": -21.10182505249977, "loss": 0.27057514190673826, "mean_token_accuracy": 0.9717823456972837, "num_tokens": 2450198.0, "rewards/accuracies": 0.996875, "rewards/chosen": 1.5084342845715581, "rewards/margins": 1.5687194842845202, "rewards/rejected": -0.06028519994646046, "step": 240 } ], "logging_steps": 20, "max_steps": 256, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1757219055360000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }