{ "model": "EleutherAI/pythia-1b", "n_layers": 16, "n_heads": 8, "d_head": 256, "n_pairs": 128, "k_dead": 91, "theta": 10000.0, "T_train": 2048, "L_crit_pred": 15, "alpha": 0.9374717358310781, "n_eval_chunks": 100, "chunk_size": 1024, "n_lambada": 200, "baseline": { "ppl": 14.973609132474365, "nll": 2.7062892603874205, "std": 0.2595318035858822, "lambada_acc": 0.57 }, "sweeps": [ { "L_prune": 11, "n_pruned_entries": 12124160, "frac_qk_pruned": 0.09033203125, "ppl": 16.53646806228249, "nll": 2.805568127632141, "std": 0.2573620626824769, "delta_ppl": 1.562858929808126, "delta_nll": 0.09927886724472046, "lambada_acc": 0.405, "delta_lambada": -0.16499999999999992, "verdict": "DEGRADED" }, { "L_prune": 13, "n_pruned_entries": 7274496, "frac_qk_pruned": 0.05419921875, "ppl": 15.548956277713987, "nll": 2.743993515968323, "std": 0.25814710014606357, "delta_ppl": 0.5753471452396219, "delta_nll": 0.0377042555809024, "lambada_acc": 0.55, "delta_lambada": -0.019999999999999907, "verdict": "DEGRADED" }, { "L_prune": 15, "n_pruned_entries": 2424832, "frac_qk_pruned": 0.01806640625, "ppl": 15.16368330360712, "nll": 2.7189033126831053, "std": 0.2584823516819064, "delta_ppl": 0.19007417113275515, "delta_nll": 0.012614052295684797, "lambada_acc": 0.54, "delta_lambada": -0.029999999999999916, "verdict": "OK" }, { "L_prune": 16, "n_pruned_entries": 0, "frac_qk_pruned": 0.0, "ppl": 14.973609132474365, "nll": 2.7062892603874205, "std": 0.2595318035858822, "delta_ppl": 0.0, "delta_nll": 0.0, "lambada_acc": 0.57, "delta_lambada": 0.0, "verdict": "OK" } ] }