taf-agent / data /exp_gamma_field /info_horizon_results.json
karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
11.4 kB
{
"horizon_results": [
{
"name": "pythia-14m",
"theta": 10000,
"T_train": 2048,
"gamma": 0.685,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": 2643.7820305491095,
"d_horizon_alpha4": 10575.128122196438,
"alpha_opt_4xTtrain": 3.0985913003949634
},
{
"name": "pythia-31m",
"theta": 10000,
"T_train": 2048,
"gamma": 1.235,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "pythia-70m",
"theta": 10000,
"T_train": 2048,
"gamma": 0.748,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": 2038.7975841991988,
"d_horizon_alpha4": 8155.190336796795,
"alpha_opt_4xTtrain": 4.018054594280709
},
{
"name": "pythia-160m",
"theta": 10000,
"T_train": 2048,
"gamma": 0.511,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": 4576.773209797773,
"d_horizon_alpha4": 18307.092839191093,
"alpha_opt_4xTtrain": 1.789907348361264
},
{
"name": "pythia-410m",
"theta": 10000,
"T_train": 2048,
"gamma": 1.022,
"arch": "MHA",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "pythia-1b",
"theta": 10000,
"T_train": 2048,
"gamma": 0.931,
"arch": "MHA",
"phase": "A",
"d_horizon_alpha1": 505.3378343021414,
"d_horizon_alpha4": 2021.3513372085656,
"alpha_opt_4xTtrain": 16.210937404504737
},
{
"name": "pythia-1.4b",
"theta": 10000,
"T_train": 2048,
"gamma": 0.705,
"arch": "MHA",
"phase": "A",
"d_horizon_alpha1": 2446.879770674857,
"d_horizon_alpha4": 9787.519082699428,
"alpha_opt_4xTtrain": 3.3479372783978767
},
{
"name": "pythia-2.8b",
"theta": 10000,
"T_train": 2048,
"gamma": 0.674,
"arch": "MHA",
"phase": "A",
"d_horizon_alpha1": 2754.0837594601494,
"d_horizon_alpha4": 11016.335037840598,
"alpha_opt_4xTtrain": 2.974491960115906
},
{
"name": "gpt2-117m",
"theta": null,
"T_train": 1024,
"gamma": 1.023,
"arch": "AbsPE",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "gpt2-345m",
"theta": null,
"T_train": 1024,
"gamma": 0.784,
"arch": "AbsPE",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "gpt2-774m",
"theta": null,
"T_train": 1024,
"gamma": 0.753,
"arch": "AbsPE",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "gpt2-1.5b",
"theta": null,
"T_train": 1024,
"gamma": 1.01,
"arch": "AbsPE",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "Qwen2.5-0.5B",
"theta": 1000000,
"T_train": 8192,
"gamma": 1.028,
"arch": "GQA-MHA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "Qwen2.5-3B",
"theta": 1000000,
"T_train": 8192,
"gamma": 0.772,
"arch": "GQA-n2",
"phase": "B",
"d_horizon_alpha1": 181964.2732624524,
"d_horizon_alpha4": 727857.0930498096,
"alpha_opt_4xTtrain": 0.1800793057477704
},
{
"name": "Qwen2.5-7B",
"theta": 1000000,
"T_train": 8192,
"gamma": 0.997,
"arch": "GQA",
"phase": "H",
"d_horizon_alpha1": 2124.507104215969,
"d_horizon_alpha4": 8498.028416863875,
"alpha_opt_4xTtrain": 15.423812862274593
},
{
"name": "gemma-2-9b",
"theta": null,
"T_train": 8192,
"gamma": 0.628,
"arch": "SWA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "phi-3-mini",
"theta": null,
"T_train": 4096,
"gamma": 0.63,
"arch": "SWA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "Llama-2-7b",
"theta": 10000,
"T_train": 4096,
"gamma": 1.026,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "Llama-3-8B",
"theta": 500000,
"T_train": 8192,
"gamma": 1.045,
"arch": "GQA",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "Mistral-7B",
"theta": 10000,
"T_train": 8192,
"gamma": 1.061,
"arch": "GQA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "DeepSeek-7B",
"theta": 10000,
"T_train": 4096,
"gamma": 0.947,
"arch": "MHA",
"phase": "A",
"d_horizon_alpha1": 384.9682527261125,
"d_horizon_alpha4": 1539.87301090445,
"alpha_opt_4xTtrain": 42.55935361936579
},
{
"name": "phi-2",
"theta": 10000,
"T_train": 2048,
"gamma": 1.045,
"arch": "MHA",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "bloom-7b1",
"theta": null,
"T_train": 2048,
"gamma": 1.151,
"arch": "ALiBi",
"phase": "B",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "mamba-2.8b",
"theta": null,
"T_train": 2048,
"gamma": 0.703,
"arch": "SSM",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
},
{
"name": "gpt-j-6B",
"theta": null,
"T_train": 2048,
"gamma": 0.897,
"arch": "AbsPE",
"phase": "A",
"d_horizon_alpha1": null,
"d_horizon_alpha4": null,
"alpha_opt_4xTtrain": null
}
],
"phase_diagram": [
{
"name": "pythia-14m",
"gamma_rand": 1.004,
"gamma_text": 0.685,
"delta_gamma": 0.31899999999999995,
"arch": "MHA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "pythia-31m",
"gamma_rand": 1.54,
"gamma_text": 1.235,
"delta_gamma": 0.30499999999999994,
"arch": "MHA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "pythia-70m",
"gamma_rand": 1.171,
"gamma_text": 0.748,
"delta_gamma": 0.42300000000000004,
"arch": "MHA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "pythia-160m",
"gamma_rand": 1.017,
"gamma_text": 0.511,
"delta_gamma": 0.5059999999999999,
"arch": "MHA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "pythia-410m",
"gamma_rand": 0.936,
"gamma_text": 1.022,
"delta_gamma": -0.08599999999999997,
"arch": "MHA",
"phase": "A",
"region": "post-IH"
},
{
"name": "pythia-1b",
"gamma_rand": 0.713,
"gamma_text": 0.931,
"delta_gamma": -0.21800000000000008,
"arch": "MHA",
"phase": "A",
"region": "post-IH"
},
{
"name": "pythia-1.4b",
"gamma_rand": 0.688,
"gamma_text": 0.705,
"delta_gamma": -0.017000000000000015,
"arch": "MHA",
"phase": "A",
"region": "post-IH"
},
{
"name": "pythia-2.8b",
"gamma_rand": 0.551,
"gamma_text": 0.674,
"delta_gamma": -0.123,
"arch": "MHA",
"phase": "A",
"region": "post-IH"
},
{
"name": "gpt2-117m",
"gamma_rand": 1.051,
"gamma_text": 1.023,
"delta_gamma": 0.028000000000000025,
"arch": "AbsPE",
"phase": "B",
"region": "pre-IH"
},
{
"name": "gpt2-345m",
"gamma_rand": 0.741,
"gamma_text": 0.784,
"delta_gamma": -0.04300000000000004,
"arch": "AbsPE",
"phase": "A",
"region": "post-IH"
},
{
"name": "gpt2-774m",
"gamma_rand": 0.727,
"gamma_text": 0.753,
"delta_gamma": -0.026000000000000023,
"arch": "AbsPE",
"phase": "A",
"region": "post-IH"
},
{
"name": "gpt2-1.5b",
"gamma_rand": 1.024,
"gamma_text": 1.01,
"delta_gamma": 0.014000000000000012,
"arch": "AbsPE",
"phase": "B",
"region": "pre-IH"
},
{
"name": "Qwen2.5-0.5B",
"gamma_rand": 0.919,
"gamma_text": 1.028,
"delta_gamma": -0.10899999999999999,
"arch": "GQA-MHA",
"phase": "B",
"region": "post-IH"
},
{
"name": "Qwen2.5-3B",
"gamma_rand": 0.964,
"gamma_text": 0.772,
"delta_gamma": 0.19199999999999995,
"arch": "GQA-n2",
"phase": "B",
"region": "pre-IH"
},
{
"name": "Qwen2.5-7B",
"gamma_rand": 0.827,
"gamma_text": 0.997,
"delta_gamma": -0.17000000000000004,
"arch": "GQA",
"phase": "H",
"region": "post-IH"
},
{
"name": "gemma-2-9b",
"gamma_rand": 1.135,
"gamma_text": 0.628,
"delta_gamma": 0.507,
"arch": "SWA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "phi-3-mini",
"gamma_rand": 1.037,
"gamma_text": 0.63,
"delta_gamma": 0.4069999999999999,
"arch": "SWA",
"phase": "B",
"region": "pre-IH"
},
{
"name": "Llama-3-8B",
"gamma_rand": 0.759,
"gamma_text": 1.045,
"delta_gamma": -0.2859999999999999,
"arch": "GQA",
"phase": "A",
"region": "post-IH"
},
{
"name": "Mistral-7B",
"gamma_rand": 0.83,
"gamma_text": 1.061,
"delta_gamma": -0.23099999999999998,
"arch": "GQA",
"phase": "B",
"region": "post-IH"
},
{
"name": "DeepSeek-7B",
"gamma_rand": 0.91,
"gamma_text": 0.947,
"delta_gamma": -0.03699999999999992,
"arch": "MHA",
"phase": "A",
"region": "post-IH"
},
{
"name": "phi-2",
"gamma_rand": 0.871,
"gamma_text": 1.045,
"delta_gamma": -0.17399999999999993,
"arch": "MHA",
"phase": "B",
"region": "post-IH"
},
{
"name": "gpt-j-6B",
"gamma_rand": 0.835,
"gamma_text": 0.897,
"delta_gamma": -0.062000000000000055,
"arch": "AbsPE",
"phase": "A",
"region": "post-IH"
}
],
"formula": {
"d_horizon": "theta * (1-gamma)*sqrt(2) / (1+gamma)",
"theta_design": "T_eval * sqrt(2) * (1+gamma) / (2*(1-gamma))",
"alpha_opt": "theta_design / theta_train",
"note": "d_horizon = T_eval when theta = theta_design. Inverse relationship.",
"validation": "pythia-70m: d_horizon(alpha=1)=2046, collapse at L=4096=2*d_horizon \u2713"
}
}