Spaces:
Running
Running
File size: 1,926 Bytes
535348a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | {
"model": "EleutherAI/pythia-2.8b",
"n_layers": 32,
"n_heads": 32,
"d_head": 80,
"n_pairs": 40,
"k_dead": 29,
"theta": 10000.0,
"T_train": 2048,
"L_crit_pred": 21,
"alpha": 0.6700912468018047,
"n_eval_chunks": 100,
"chunk_size": 1024,
"n_lambada": 200,
"baseline": {
"ppl": 11.497348191010678,
"nll": 2.4421164166927336,
"std": 0.2581671149222198,
"lambada_acc": 0.615
},
"sweeps": [
{
"L_prune": 17,
"n_pruned_entries": 54067200,
"frac_qk_pruned": 0.12890625,
"ppl": 14.078933084106032,
"nll": 2.6446795725822447,
"std": 0.2594730613347292,
"delta_ppl": 2.5815848930953536,
"delta_nll": 0.20256315588951113,
"lambada_acc": 0.425,
"delta_lambada": -0.19,
"verdict": "DEGRADED"
},
{
"L_prune": 19,
"n_pruned_entries": 46858240,
"frac_qk_pruned": 0.11171875,
"ppl": 13.294295816582924,
"nll": 2.587335057258606,
"std": 0.2611973881683509,
"delta_ppl": 1.7969476255722459,
"delta_nll": 0.1452186405658722,
"lambada_acc": 0.485,
"delta_lambada": -0.13,
"verdict": "DEGRADED"
},
{
"L_prune": 21,
"n_pruned_entries": 39649280,
"frac_qk_pruned": 0.09453125,
"ppl": 12.887260668117273,
"nll": 2.556239278316498,
"std": 0.26105946628430654,
"delta_ppl": 1.389912477106595,
"delta_nll": 0.1141228616237644,
"lambada_acc": 0.475,
"delta_lambada": -0.14,
"verdict": "DEGRADED"
},
{
"L_prune": 23,
"n_pruned_entries": 32440320,
"frac_qk_pruned": 0.07734375,
"ppl": 12.418053153651558,
"nll": 2.519151313304901,
"std": 0.2594559840569815,
"delta_ppl": 0.9207049626408796,
"delta_nll": 0.07703489661216745,
"lambada_acc": 0.53,
"delta_lambada": -0.08499999999999996,
"verdict": "DEGRADED"
}
]
} |