| { |
| "config": { |
| "model_X": "Qwen/Qwen2.5-3B-Instruct", |
| "model_Y": "meta-llama/Llama-3.2-3B-Instruct", |
| "hub_repo": "CK0607/cross-model-lora-prediction-3b", |
| "lora": { |
| "r": 16, |
| "alpha": 32, |
| "targets": [ |
| "q_proj", |
| "k_proj", |
| "v_proj", |
| "o_proj", |
| "gate_proj", |
| "up_proj", |
| "down_proj" |
| ], |
| "dropout": 0.0 |
| }, |
| "train": { |
| "examples": 1500, |
| "eval_examples": 300, |
| "round5_eval_examples": 100, |
| "round5_fast_surrogate": true, |
| "epochs": 3.0, |
| "bs": 8, |
| "lr": 0.0002, |
| "max_len": 512 |
| }, |
| "r4_anchor_names": [ |
| "gsm8k", |
| "mbpp", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "svamp", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "math_counting_easy", |
| "humaneval", |
| "mmlu_high_school_physics", |
| "mbpp_sanitized", |
| "mmlu_elementary_math", |
| "math_algebra_easy", |
| "aqua_rat", |
| "medmcqa_easy" |
| ], |
| "r5_requested_new_anchors": [ |
| "aqua_rat_numeric", |
| "math_counting_easy", |
| "mawps", |
| "mbpp_sanitized", |
| "humaneval", |
| "conala_curated", |
| "medmcqa_easy", |
| "pubmedqa_pqal" |
| ], |
| "r5_new_anchor_names": [ |
| "aqua_rat_numeric", |
| "math_counting_easy", |
| "mawps", |
| "mbpp_sanitized", |
| "humaneval", |
| "conala_curated", |
| "medmcqa_easy", |
| "pubmedqa_pqal" |
| ], |
| "dropped": [], |
| "pool_anchor_names": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "heldouts": [ |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "openbookqa_test" |
| ], |
| "N_values": [ |
| 4, |
| 8, |
| 12, |
| 16, |
| 24 |
| ], |
| "methods": [ |
| "mean", |
| "global_ridge", |
| "topk8_global_ridge" |
| ], |
| "seeds": [ |
| 11, |
| 22, |
| 33, |
| 44, |
| 55 |
| ] |
| }, |
| "dataset_audit": { |
| "tasks": { |
| "aqua_rat_numeric": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "deepmind/aqua_rat", |
| "config": "raw", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: From (1, 2, 3, 4, 5, 6), one number is picked out and replaced and one number is picked out again. If the sum of the 2 numbers is 9, what is the probability that the 2 numbers included the number 5?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "1/2" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "math_counting_easy": { |
| "domain": "math", |
| "kind": "math_solution", |
| "dataset": "EleutherAI/hendrycks_math", |
| "config": "counting_and_probability", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Alex has 10 different kinds of lunch meat and 9 different kinds of cheese. If he wants to make a sandwich with one kind of meat and two kinds of cheese, how many different sandwiches could he make? (It does not matter in which order he chooses the two types of cheese.)\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "360" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mawps": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "MU-NLPC/Calc-mawps", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Marilyn had 279 Beg. Daryl clasped some Beg. Now Marilyn has 59 Beg. How many did Daryl claspeds?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "220" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mbpp_sanitized": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "google-research-datasets/mbpp", |
| "config": "sanitized", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to reverse words seperated by spaces in a given string.\n\nTests:\nassert reverse_words(\"python program\")==(\"program python\")\nassert reverse_words(\"java language\")==(\"language java\")\nassert reverse_words(\"indian man\")==(\"man indian\")\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "def reverse_words(s):\n return ' '.join(reversed(s.split()))" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "humaneval": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "openai/openai_humaneval", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Complete the following Python function. Respond with only the completion code.\n\ndef below_threshold(l: list, t: int):\n \"\"\"Return True if all numbers in the list l are below threshold t.\n >>> below_threshold([1, 2, 4, 10], 100)\n True\n >>> below_threshold([1, 20, 4, 10], 5)\n False\n \"\"\"" |
| }, |
| { |
| "role": "assistant", |
| "content": "for e in l:\n if e >= t:\n return False\n return True" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "conala_curated": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "codeparrot/conala-mined-curated", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python code snippet for this intent. Respond with only code.\n\nIntent: print all contacts in class all_contacts\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "all_contacts = []" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "medmcqa_easy": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "openlifescienceai/medmcqa", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nApolipoprotein A-I (Apo A-I) is found in which of the following lipid components?\nA. LDL\nB. HDL\nC. VLDL\nD. Chylomicron\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "B" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "pubmedqa_pqal": { |
| "domain": "science", |
| "kind": "short", |
| "dataset": "qiaojin/PubMedQA", |
| "config": "pqa_labeled", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "yes", |
| "no", |
| "maybe" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the biomedical question with only one of: yes, no, maybe.\n\nContext: To assess whether eligibility to an adjuvant chemotherapy protocol in itself represents a good prognostic factor after radical cystectomy for bladder cancer. Between April 1984 and May 1989, our institution entered 35 patients with invasive bladder cancer into the Swiss Group for Clinical and Epidemiological Cancer Research (SAKK) study 09/84. They were randomly assigned to either observation or three postoperative courses of cisplatin monotherapy after cystectomy. This study had a negative result. The outcome of these 35 patients (protocol group) was compared with an age- and tumor-stage-matched cohort (matched group; n = 35) who also underwent cystectomy during the same period, but were not entered into the SAKK study, as well as the remaining 57 patients treated during the study period for the same indication (remaining group). Median overall survival decreased from 76.3 months in the protocol group to 52.1 months in the matched group and to 20.3 months in the remaining group. The respective times of median recurrence-free survival were 67.2, 16.0, and 9.4 months. Tumor progression occurred in 46% of the protocol group compared with 69% in the matched group and 65% in the remaining group (P<.05). Cancer-related death was noted in 40% of the protocol group, 57% in the matched group, and 56% in the remaining group.\n\nQuestion: Is eligibility for a chemotherapy protocol a good prognostic factor for invasive bladder cancer after radical cystectomy?\n\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "yes" |
| } |
| ] |
| }, |
| "ok": true |
| } |
| }, |
| "dropped": [], |
| "kept": [ |
| "aqua_rat_numeric", |
| "math_counting_easy", |
| "mawps", |
| "mbpp_sanitized", |
| "humaneval", |
| "conala_curated", |
| "medmcqa_easy", |
| "pubmedqa_pqal" |
| ], |
| "anchor_domain_counts": { |
| "math": 3, |
| "code": 3, |
| "science": 2 |
| } |
| }, |
| "training": [ |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "aqua_rat_numeric", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "humaneval", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "aqua_rat_numeric", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "humaneval", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "math_counting_easy", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "conala_curated", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "math_counting_easy", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "conala_curated", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mawps", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "medmcqa_easy", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mawps", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "medmcqa_easy", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mbpp_sanitized", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "pubmedqa_pqal", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mbpp_sanitized", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "pubmedqa_pqal", |
| "ok": true, |
| "gpu": 7 |
| } |
| ], |
| "baselines": { |
| "gsm_hard": { |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15 |
| }, |
| "gsm8k_test_500": { |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333 |
| }, |
| "mbpp_test_held": { |
| "base_Y": 0.23, |
| "oracle": 0.32 |
| }, |
| "mbpp_plus": { |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45 |
| }, |
| "openbookqa_test": { |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333 |
| } |
| }, |
| "records": [ |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.06235950804304804, |
| "gap_recovered": -0.011236445657138083 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07221313142228401, |
| "gap_recovered": 0.10245920871866164 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0722087583596679, |
| "gap_recovered": 0.10240875030386039 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.09861895769491964, |
| "gap_recovered": 0.08727636419493583 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12459708992092089, |
| "gap_recovered": 0.20904885900431666 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12459502910745557, |
| "gap_recovered": 0.209039198941198 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.24167782528647064, |
| "gap_recovered": 0.12975361429411808 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25468766738628523, |
| "gap_recovered": 0.27430741540316916 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25469019236235785, |
| "gap_recovered": 0.2743354706928649 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2398360331853231, |
| "gap_recovered": 0.09929728507995608 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.274934759222228, |
| "gap_recovered": 0.24972039666669132 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749305629182136, |
| "gap_recovered": 0.2497024125066297 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.733175716372742, |
| "gap_recovered": 0.08478920624173929 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7639258190681194, |
| "gap_recovered": 0.19728958195653454 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r5:aqua_rat_numeric", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7639315212458029, |
| "gap_recovered": 0.1973104435822058 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.06060013856010876, |
| "gap_recovered": -0.03153686276797588 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0706267873720191, |
| "gap_recovered": 0.08415523890791271 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07063988874698508, |
| "gap_recovered": 0.0843064086190586 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.09357568675074085, |
| "gap_recovered": 0.06363603164409773 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1202605464540679, |
| "gap_recovered": 0.18872131150344332 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1203002280750494, |
| "gap_recovered": 0.1889073191017941 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23954079837634648, |
| "gap_recovered": 0.10600887084829412 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25191722294379926, |
| "gap_recovered": 0.2435246993755472 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25194909498609347, |
| "gap_recovered": 0.24387883317881623 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.23348182179461952, |
| "gap_recovered": 0.07206495054836931 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26477579495002485, |
| "gap_recovered": 0.2061819783572493 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2648303948599717, |
| "gap_recovered": 0.20641597797130715 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7315477586888719, |
| "gap_recovered": 0.07883326349587293 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7632082435454445, |
| "gap_recovered": 0.19466430565406528 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7632605181891342, |
| "gap_recovered": 0.19485555435049115 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07145326948713983, |
| "gap_recovered": 0.09369157100545951 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": [ |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07145337636443391, |
| "gap_recovered": 0.09369280420500663 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12266466074976429, |
| "gap_recovered": 0.1999905972645201 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": [ |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12266483613814431, |
| "gap_recovered": 0.19999141939755147 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2502199395771684, |
| "gap_recovered": 0.2246659953018713 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25022055001094423, |
| "gap_recovered": 0.22467277789938025 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26078072169731403, |
| "gap_recovered": 0.18906023584563153 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2607817048313974, |
| "gap_recovered": 0.1890644492774174 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7652856901322288, |
| "gap_recovered": 0.20226471999595919 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7652856620426836, |
| "gap_recovered": 0.20226461722933023 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.06258799388490875, |
| "gap_recovered": -0.008600070558745234 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07188610471528153, |
| "gap_recovered": 0.09868582363786381 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07188549907728174, |
| "gap_recovered": 0.09867883550709697 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.09898812831133263, |
| "gap_recovered": 0.08900685145937172 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12376669171212734, |
| "gap_recovered": 0.20515636740059692 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12377916621065689, |
| "gap_recovered": 0.20521484161245418 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.24100257454247312, |
| "gap_recovered": 0.12225082824970122 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25219830919956343, |
| "gap_recovered": 0.24664787999514912 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2522135422969687, |
| "gap_recovered": 0.2468171366329853 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.23734664177072462, |
| "gap_recovered": 0.0886284647316769 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2656234004031653, |
| "gap_recovered": 0.20981457315642263 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26564392632451556, |
| "gap_recovered": 0.2099025413907809 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7332305190755033, |
| "gap_recovered": 0.08498970393476829 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7634382407418613, |
| "gap_recovered": 0.19550575881168777 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7634770604933815, |
| "gap_recovered": 0.19564778229285923 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.06984950443793989, |
| "gap_recovered": 0.07518658966853708 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r5:math_counting_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.06985002991796911, |
| "gap_recovered": 0.07519265289964354 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.11843885312135194, |
| "gap_recovered": 0.1801821240063372 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 4, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:sciq", |
| "r5:math_counting_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.11843983968098959, |
| "gap_recovered": 0.1801867485046387 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2514841386778601, |
| "gap_recovered": 0.23871265197622352 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r5:math_counting_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2514852855534389, |
| "gap_recovered": 0.23872539503821 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26363710987156835, |
| "gap_recovered": 0.20130189944957863 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 4, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r5:math_counting_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26363701395604805, |
| "gap_recovered": 0.20130148838306303 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7588161622518781, |
| "gap_recovered": 0.17859571555565168 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 4, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:sciq", |
| "r5:math_counting_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7588159375355161, |
| "gap_recovered": 0.1785948934226201 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07245608132461023, |
| "gap_recovered": 0.10526247682242572 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r5:medmcqa_easy", |
| "r4:mbpp", |
| "r5:mbpp_sanitized", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0724562861727572, |
| "gap_recovered": 0.10526484045489079 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12521504895440466, |
| "gap_recovered": 0.21194554197377186 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r5:medmcqa_easy", |
| "r4:mbpp", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1252146762540971, |
| "gap_recovered": 0.21194379494108018 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25471247319517465, |
| "gap_recovered": 0.27458303550194046 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r5:medmcqa_easy", |
| "r4:mbpp", |
| "r5:mbpp_sanitized", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25471275066507276, |
| "gap_recovered": 0.2745861185008084 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493516686318936, |
| "gap_recovered": 0.24972214369938295 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r5:medmcqa_easy", |
| "r4:mbpp", |
| "r5:mbpp_sanitized", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749353107364699, |
| "gap_recovered": 0.24972276029915658 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7678827371268436, |
| "gap_recovered": 0.21176611143967194 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:multiarith", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:multiarith", |
| "r5:medmcqa_easy", |
| "r4:mbpp", |
| "r5:mbpp_sanitized", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.767877849545972, |
| "gap_recovered": 0.21174823004623933 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.05140127220373045, |
| "gap_recovered": -0.13767762841849493 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0708979350671001, |
| "gap_recovered": 0.08728386615884724 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0708976678738649, |
| "gap_recovered": 0.0872807831599795 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.06954419015467853, |
| "gap_recovered": -0.04901160864994442 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12091904212688578, |
| "gap_recovered": 0.19180800996977712 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12091888866205326, |
| "gap_recovered": 0.19180729060337467 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.22929394602775574, |
| "gap_recovered": -0.007845044136047438 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25210894539438444, |
| "gap_recovered": 0.24565494882649372 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:mawps" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2521090841293335, |
| "gap_recovered": 0.24565649032592798 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2074264666129803, |
| "gap_recovered": -0.03960085737294156 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2652467161759563, |
| "gap_recovered": 0.20820021218266982 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:mawps" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2652471477957978, |
| "gap_recovered": 0.20820206198199043 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6989556559749033, |
| "gap_recovered": -0.040406136677183 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7639373076921222, |
| "gap_recovered": 0.1973316135077643 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:mawps" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7639364930953102, |
| "gap_recovered": 0.19732863327552524 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0646524178844759, |
| "gap_recovered": 0.015220206359337235 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07239382530080862, |
| "gap_recovered": 0.10454413808625329 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07239352248180872, |
| "gap_recovered": 0.1045406440208698 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.10447578956340922, |
| "gap_recovered": 0.11473026357848072 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12504286141231144, |
| "gap_recovered": 0.21113841287020987 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1250425544826464, |
| "gap_recovered": 0.21113697413740498 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.24428923323236665, |
| "gap_recovered": 0.1587692581374071 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547727211590471, |
| "gap_recovered": 0.27525245732274534 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477285064499955, |
| "gap_recovered": 0.27525389605555045 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.24603687565902185, |
| "gap_recovered": 0.12587232425295075 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493519084206947, |
| "gap_recovered": 0.24972224646601196 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493516686318936, |
| "gap_recovered": 0.24972214369938295 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7419641204812061, |
| "gap_recovered": 0.11694190419953457 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7681897558563057, |
| "gap_recovered": 0.21288935069380135 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7681880985731366, |
| "gap_recovered": 0.21288328746269514 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07217366698144498, |
| "gap_recovered": 0.10200384978590354 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r4:sciq", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07217657938770865, |
| "gap_recovered": 0.10203745447356127 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12450159094799526, |
| "gap_recovered": 0.2086012075687278 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r4:sciq", |
| "r4:arc_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12450937380735902, |
| "gap_recovered": 0.20863768972199542 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25476326868451876, |
| "gap_recovered": 0.2751474298279861 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547594395999251, |
| "gap_recovered": 0.275104884443612 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749258150999574, |
| "gap_recovered": 0.2496820647141031 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749346633067076, |
| "gap_recovered": 0.2497199856001755 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7647975219255206, |
| "gap_recovered": 0.20047873875190492 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:sciq", |
| "r5:mbpp_sanitized", |
| "r4:gsm8k" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7648052746400066, |
| "gap_recovered": 0.20050710234148764 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07169886460249453, |
| "gap_recovered": 0.09652536079801373 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:mbpp_sanitized", |
| "r5:math_counting_easy", |
| "r4:sciq", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07169961274355308, |
| "gap_recovered": 0.09653399319484314 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1234050847196031, |
| "gap_recovered": 0.20346133462313953 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 8, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:mbpp_sanitized", |
| "r4:sciq", |
| "r5:math_counting_easy", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1234069701446884, |
| "gap_recovered": 0.2034701725532269 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2520854159470262, |
| "gap_recovered": 0.2453935105225134 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy", |
| "r5:math_counting_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2520842228264644, |
| "gap_recovered": 0.24538025362738236 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2653153916885113, |
| "gap_recovered": 0.2084945358079056 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 8, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy", |
| "r4:sciq" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2653136891880255, |
| "gap_recovered": 0.2084872393772521 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7619699721226747, |
| "gap_recovered": 0.19013404435124903 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 8, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:sciq", |
| "r5:mbpp_sanitized", |
| "r5:math_counting_easy", |
| "r5:aqua_rat_numeric" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7619703091972175, |
| "gap_recovered": 0.19013527755079598 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.046795635689264065, |
| "gap_recovered": -0.19081958820079933 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07289376165675021, |
| "gap_recovered": 0.11031263450096393 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:medmcqa_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07289361024725027, |
| "gap_recovered": 0.11031088746827233 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.05766524479306978, |
| "gap_recovered": -0.10469416503248542 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12629289824387124, |
| "gap_recovered": 0.21699796051814646 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:medmcqa_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1262937751857714, |
| "gap_recovered": 0.21700207118330342 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.22348956247855878, |
| "gap_recovered": -0.07233819468268035 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477145404651247, |
| "gap_recovered": 0.27523837829458286 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:medmcqa_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547717407654072, |
| "gap_recovered": 0.27524156406008 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.19311567914897, |
| "gap_recovered": -0.10093280364727152 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749361260183927, |
| "gap_recovered": 0.24972625436454002 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:medmcqa_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749361020395126, |
| "gap_recovered": 0.249726151597911 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6818556116093164, |
| "gap_recovered": -0.10296727460006182 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7701793945246729, |
| "gap_recovered": 0.22016851655368141 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:svamp", |
| "r5:medmcqa_easy", |
| "r4:mbpp" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7701706024970131, |
| "gap_recovered": 0.2201363505988285 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.057627453502567344, |
| "gap_recovered": -0.06583707497037687 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07227199409199858, |
| "gap_recovered": 0.10313839336921435 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07227138845399879, |
| "gap_recovered": 0.10313140523844752 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08585960651266165, |
| "gap_recovered": 0.0274669055281015 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12446026506095098, |
| "gap_recovered": 0.20840749247320772 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12446679827810705, |
| "gap_recovered": 0.20843811692862682 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23572552253460063, |
| "gap_recovered": 0.06361691705111805 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25255285098634916, |
| "gap_recovered": 0.25058723318165727 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25256959167020077, |
| "gap_recovered": 0.2507732407800084 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2237824846958292, |
| "gap_recovered": 0.030496362982125127 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26642122570125537, |
| "gap_recovered": 0.21323382443395156 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.266440984298443, |
| "gap_recovered": 0.2133185041361842 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208439004010168, |
| "gap_recovered": 0.039672806345183394 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7701774563460514, |
| "gap_recovered": 0.22016142565628571 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:svamp", |
| "r4:openbookqa", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.770183636046004, |
| "gap_recovered": 0.22018403431464906 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.05790650121096908, |
| "gap_recovered": -0.06261729371958759 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07281855566748258, |
| "gap_recovered": 0.10944487308633737 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07281329196074915, |
| "gap_recovered": 0.10938413800864398 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08668351535139414, |
| "gap_recovered": 0.03132897820966002 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1260294868206156, |
| "gap_recovered": 0.21576321947163563 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12603113108667835, |
| "gap_recovered": 0.21577092696880476 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23607687341755837, |
| "gap_recovered": 0.06752081575064844 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.254778455536941, |
| "gap_recovered": 0.27531617263267777 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547751813921435, |
| "gap_recovered": 0.2752797932460388 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22509549623248223, |
| "gap_recovered": 0.036123555282066684 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27492768545260377, |
| "gap_recovered": 0.24969008051115896 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493065883373397, |
| "gap_recovered": 0.24970282357314552 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.718775217752347, |
| "gap_recovered": 0.03210445519151349 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7686479525182439, |
| "gap_recovered": 0.21456567994479495 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7686677837371826, |
| "gap_recovered": 0.21463823318481448 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.05730614473079814, |
| "gap_recovered": -0.06954448387540614 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07286305224758456, |
| "gap_recovered": 0.1099582951644372 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07285891075243896, |
| "gap_recovered": 0.10991050868198791 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.0851024329525301, |
| "gap_recovered": 0.023917654464984846 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.126309691681259, |
| "gap_recovered": 0.21707667975590156 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12630539466594828, |
| "gap_recovered": 0.2170565374966326 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23555245531016383, |
| "gap_recovered": 0.061693947890709144 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25476923428732773, |
| "gap_recovered": 0.27521371430364133 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547697984761205, |
| "gap_recovered": 0.2752199830680057 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22392417589823407, |
| "gap_recovered": 0.03110361099243171 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.274942360527214, |
| "gap_recovered": 0.24975297368805993 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493696527919553, |
| "gap_recovered": 0.24972985119655225 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7177663255559987, |
| "gap_recovered": 0.02841338618048329 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7681811604554626, |
| "gap_recovered": 0.21285790410535108 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7681842503054388, |
| "gap_recovered": 0.21286920843453233 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.04166666666666667, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07221353221213682, |
| "gap_recovered": 0.10246383321696324 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:openbookqa", |
| "r5:conala_curated", |
| "r5:mbpp_sanitized", |
| "r5:math_counting_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0722140398792837, |
| "gap_recovered": 0.10246969091481181 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.026666666666666672, |
| "gap_recovered": -0.25 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12460340390260194, |
| "gap_recovered": 0.20907845579344658 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 12, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:openbookqa", |
| "r5:conala_curated", |
| "r5:mbpp_sanitized", |
| "r4:sciq" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1246049604744747, |
| "gap_recovered": 0.20908575222410014 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.20750000000000002, |
| "gap_recovered": -0.24999999999999992 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2525556534323199, |
| "gap_recovered": 0.2506183714702209 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25255594015121463, |
| "gap_recovered": 0.250621557235718 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.15833333333333333, |
| "gap_recovered": -0.25000000000000006 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26660809311373485, |
| "gap_recovered": 0.2140346847731493 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 12, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2666099874452613, |
| "gap_recovered": 0.2140428033368342 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6416666666666666, |
| "gap_recovered": -0.2500000000000001 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7690827786785432, |
| "gap_recovered": 0.21615650736052422 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 12, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r5:conala_curated", |
| "r4:sciq", |
| "r5:mbpp_sanitized" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7690766832472264, |
| "gap_recovered": 0.2161342070020481 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.059948044563161926, |
| "gap_recovered": -0.03906102427120862 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07311635143455418, |
| "gap_recovered": 0.11288097809100972 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07309355094515045, |
| "gap_recovered": 0.1126178955209667 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.0921708915973532, |
| "gap_recovered": 0.05705105436259312 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12692907574533047, |
| "gap_recovered": 0.21998004255623657 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12690070667486084, |
| "gap_recovered": 0.2198470625384102 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23833444250041042, |
| "gap_recovered": 0.09260491667122672 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547743489824493, |
| "gap_recovered": 0.27527054424943626 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2547730911189112, |
| "gap_recovered": 0.2752565679879024 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2306903924065075, |
| "gap_recovered": 0.060101681742174916 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.274936221933913, |
| "gap_recovered": 0.2497266654310556 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493053893933356, |
| "gap_recovered": 0.24970230974000093 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7263416142847346, |
| "gap_recovered": 0.05978639372463909 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7703233534440227, |
| "gap_recovered": 0.22069519552691244 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:svamp" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7700408568875543, |
| "gap_recovered": 0.21966167153983296 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.05999645997738017, |
| "gap_recovered": -0.038502384876382696 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0728135057153373, |
| "gap_recovered": 0.10938660440773808 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07278109517590754, |
| "gap_recovered": 0.10901263664508698 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.09217455282978629, |
| "gap_recovered": 0.05706821638962322 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12601758233432114, |
| "gap_recovered": 0.21570741719213035 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12597274867967628, |
| "gap_recovered": 0.21549725943598258 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23862580439140058, |
| "gap_recovered": 0.09584227101556184 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477864976586967, |
| "gap_recovered": 0.2753183307318851 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477734565734866, |
| "gap_recovered": 0.27530384063720725 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.23142021360068488, |
| "gap_recovered": 0.06322948686007801 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493547858863043, |
| "gap_recovered": 0.24972347966555897 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749307787281344, |
| "gap_recovered": 0.24970333740629014 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7280278296854304, |
| "gap_recovered": 0.06595547445889202 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7704543911725625, |
| "gap_recovered": 0.22117460185083868 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r4:openbookqa", |
| "r5:pubmedqa_pqal" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7700888338308225, |
| "gap_recovered": 0.21983719694203366 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.052866684595743826, |
| "gap_recovered": -0.12076902389526362 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.0728197847563645, |
| "gap_recovered": 0.10945905488112877 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07280703073260429, |
| "gap_recovered": 0.10931189306851101 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.07347902846062322, |
| "gap_recovered": -0.030567054090828657 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12603295074112114, |
| "gap_recovered": 0.21577945659900535 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1260124741477528, |
| "gap_recovered": 0.21568347256759124 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23054410515160398, |
| "gap_recovered": 0.006045612795599651 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477470044432016, |
| "gap_recovered": 0.275274449381335 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25477317435988067, |
| "gap_recovered": 0.2752574928875629 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.21094915592807464, |
| "gap_recovered": -0.024503617451108697 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749346393278276, |
| "gap_recovered": 0.24971988283354674 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27493053893933356, |
| "gap_recovered": 0.24970230974000093 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7025252753838725, |
| "gap_recovered": -0.027346553473636886 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7686494412641416, |
| "gap_recovered": 0.2145711265761279 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:svamp", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:svamp", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7682971421877542, |
| "gap_recovered": 0.2132822275161742 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.05988999237959414, |
| "gap_recovered": -0.039730857158529254 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07309392501567973, |
| "gap_recovered": 0.1126222117193814 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_high_school_biology", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306783804948304, |
| "gap_recovered": 0.11232120826326579 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.09205213174052623, |
| "gap_recovered": 0.05649436753371671 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716354808588137, |
| "gap_recovered": 0.22107913165256893 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_high_school_biology", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12713919102460491, |
| "gap_recovered": 0.22096495792783555 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.23857803332394567, |
| "gap_recovered": 0.09531148137717402 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25479441930507796, |
| "gap_recovered": 0.27549354783419944 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25479235677883544, |
| "gap_recovered": 0.2754706308759492 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2313646065777746, |
| "gap_recovered": 0.06299117104760542 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749491705291573, |
| "gap_recovered": 0.24978215941067408 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r5:humaneval", |
| "r4:math_counting_easy", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749346393278276, |
| "gap_recovered": 0.24971988283354674 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7265955999527854, |
| "gap_recovered": 0.06071560958336151 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7705321992128744, |
| "gap_recovered": 0.22145926541295533 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:openbookqa" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7701881865523327, |
| "gap_recovered": 0.22020068250853433 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.043656596123487115, |
| "gap_recovered": -0.2270392754982257 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07285970342570339, |
| "gap_recovered": 0.10991965491196211 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07284248727491534, |
| "gap_recovered": 0.10972100701825385 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.04926707914505883, |
| "gap_recovered": -0.14406056650753674 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12633001480979483, |
| "gap_recovered": 0.21717194442091328 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 16, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:conala_curated" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.126309735528354, |
| "gap_recovered": 0.21707688528915942 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2197829900938889, |
| "gap_recovered": -0.11352233229012336 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.25266901838368383, |
| "gap_recovered": 0.25187798204093137 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2526553668646977, |
| "gap_recovered": 0.25172629849664097 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.18353273553409796, |
| "gap_recovered": -0.14200256199672304 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.26689128368750387, |
| "gap_recovered": 0.21524835866073083 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 16, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2668517425142486, |
| "gap_recovered": 0.2150788964896368 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.6723003910053735, |
| "gap_recovered": -0.13792539876082843 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7697641186878599, |
| "gap_recovered": 0.2186492147116826 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 16, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.769729568547216, |
| "gap_recovered": 0.21852281175810734 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.058087711663081736, |
| "gap_recovered": -0.0605264038875185 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07326510681503122, |
| "gap_recovered": 0.11459738632728325 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306991325027642, |
| "gap_recovered": 0.11234515288780485 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08727496881594604, |
| "gap_recovered": 0.034101416324747065 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1274962817663434, |
| "gap_recovered": 0.22263882077973468 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716514850484914, |
| "gap_recovered": 0.22108663361648034 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2363231649481017, |
| "gap_recovered": 0.07025738831224113 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548052961250832, |
| "gap_recovered": 0.275614401389813 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548048059282632, |
| "gap_recovered": 0.27560895475848 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22549109979607596, |
| "gap_recovered": 0.037818999126039775 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27496794599226154, |
| "gap_recovered": 0.24986262568112086 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749465808101084, |
| "gap_recovered": 0.24977106061475035 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 11, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208252208534328, |
| "gap_recovered": 0.03960446653694945 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 11, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7718177453402815, |
| "gap_recovered": 0.22616248295224942 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 11, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7709135990581293, |
| "gap_recovered": 0.22285463070047312 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.058087711663081736, |
| "gap_recovered": -0.0605264038875185 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07326510681503122, |
| "gap_recovered": 0.11459738632728325 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306991325027642, |
| "gap_recovered": 0.11234515288780485 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08727496881594604, |
| "gap_recovered": 0.034101416324747065 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1274962817663434, |
| "gap_recovered": 0.22263882077973468 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716514850484914, |
| "gap_recovered": 0.22108663361648034 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2363231649481017, |
| "gap_recovered": 0.07025738831224113 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548052961250832, |
| "gap_recovered": 0.275614401389813 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548048059282632, |
| "gap_recovered": 0.27560895475848 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22549109979607596, |
| "gap_recovered": 0.037818999126039775 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27496794599226154, |
| "gap_recovered": 0.24986262568112086 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749465808101084, |
| "gap_recovered": 0.24977106061475035 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 22, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208252208534328, |
| "gap_recovered": 0.03960446653694945 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 22, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7718177453402815, |
| "gap_recovered": 0.22616248295224942 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 22, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7709135990581293, |
| "gap_recovered": 0.22285463070047312 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.058087711663081736, |
| "gap_recovered": -0.0605264038875185 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07326510681503122, |
| "gap_recovered": 0.11459738632728325 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306991325027642, |
| "gap_recovered": 0.11234515288780485 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08727496881594604, |
| "gap_recovered": 0.034101416324747065 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1274962817663434, |
| "gap_recovered": 0.22263882077973468 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716514850484914, |
| "gap_recovered": 0.22108663361648034 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2363231649481017, |
| "gap_recovered": 0.07025738831224113 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548052961250832, |
| "gap_recovered": 0.275614401389813 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548048059282632, |
| "gap_recovered": 0.27560895475848 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22549109979607596, |
| "gap_recovered": 0.037818999126039775 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27496794599226154, |
| "gap_recovered": 0.24986262568112086 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749465808101084, |
| "gap_recovered": 0.24977106061475035 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 33, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208252208534328, |
| "gap_recovered": 0.03960446653694945 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 33, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7718177453402815, |
| "gap_recovered": 0.22616248295224942 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 33, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7709135990581293, |
| "gap_recovered": 0.22285463070047312 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.058087711663081736, |
| "gap_recovered": -0.0605264038875185 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07326510681503122, |
| "gap_recovered": 0.11459738632728325 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306991325027642, |
| "gap_recovered": 0.11234515288780485 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08727496881594604, |
| "gap_recovered": 0.034101416324747065 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1274962817663434, |
| "gap_recovered": 0.22263882077973468 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716514850484914, |
| "gap_recovered": 0.22108663361648034 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2363231649481017, |
| "gap_recovered": 0.07025738831224113 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548052961250832, |
| "gap_recovered": 0.275614401389813 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548048059282632, |
| "gap_recovered": 0.27560895475848 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22549109979607596, |
| "gap_recovered": 0.037818999126039775 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27496794599226154, |
| "gap_recovered": 0.24986262568112086 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749465808101084, |
| "gap_recovered": 0.24977106061475035 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 44, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208252208534328, |
| "gap_recovered": 0.03960446653694945 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 44, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7718177453402815, |
| "gap_recovered": 0.22616248295224942 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 44, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7709135990581293, |
| "gap_recovered": 0.22285463070047312 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.058087711663081736, |
| "gap_recovered": -0.0605264038875185 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07326510681503122, |
| "gap_recovered": 0.11459738632728325 |
| }, |
| { |
| "task": "gsm_hard", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "accuracy": 0.07306991325027642, |
| "gap_recovered": 0.11234515288780485 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.08727496881594604, |
| "gap_recovered": 0.034101416324747065 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.1274962817663434, |
| "gap_recovered": 0.22263882077973468 |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "domain": "math", |
| "N": 24, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "accuracy": 0.12716514850484914, |
| "gap_recovered": 0.22108663361648034 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2363231649481017, |
| "gap_recovered": 0.07025738831224113 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548052961250832, |
| "gap_recovered": 0.275614401389813 |
| }, |
| { |
| "task": "mbpp_test_held", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "accuracy": 0.2548048059282632, |
| "gap_recovered": 0.27560895475848 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.22549109979607596, |
| "gap_recovered": 0.037818999126039775 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.27496794599226154, |
| "gap_recovered": 0.24986262568112086 |
| }, |
| { |
| "task": "mbpp_plus", |
| "domain": "code", |
| "N": 24, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "accuracy": 0.2749465808101084, |
| "gap_recovered": 0.24977106061475035 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 55, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7208252208534328, |
| "gap_recovered": 0.03960446653694945 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 55, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": null, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7718177453402815, |
| "gap_recovered": 0.22616248295224942 |
| }, |
| { |
| "task": "openbookqa_test", |
| "domain": "science", |
| "N": 24, |
| "seed": 55, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "accuracy": 0.7709135990581293, |
| "gap_recovered": 0.22285463070047312 |
| } |
| ], |
| "summary": { |
| "4": { |
| "mean": { |
| "gap_recovered_mean": -0.057793517770438306, |
| "gap_recovered_std": 0.17563038381464027, |
| "accuracy_mean": 0.25036947076019195, |
| "accuracy_std": 0.03216155322401068, |
| "per_seed_gap_recovered": [ |
| 0.07797600483072223, |
| 0.05780125075373164, |
| -0.25, |
| 0.07525515556335458, |
| -0.25 |
| ], |
| "per_seed_accuracy": [ |
| 0.27513360811650067, |
| 0.2717492408341375, |
| 0.21516666666666664, |
| 0.27463117151698846, |
| 0.21516666666666664 |
| ] |
| }, |
| "global_ridge": { |
| "gap_recovered_mean": 0.18758141994476324, |
| "gap_recovered_std": 0.012099918774017874, |
| "accuracy_mean": 0.2948275943624562, |
| "accuracy_std": 0.002092518094207094, |
| "per_seed_gap_recovered": [ |
| 0.20656509234987466, |
| 0.18344950675964355, |
| 0.18193462388268833, |
| 0.19116208060034404, |
| 0.17479579613126564 |
| ], |
| "per_seed_accuracy": [ |
| 0.2980716934039675, |
| 0.2941577190530711, |
| 0.29408085632872305, |
| 0.2953825493543998, |
| 0.29244515367211965 |
| ] |
| }, |
| "topk8_global_ridge": { |
| "gap_recovered_mean": 0.18764435011765057, |
| "gap_recovered_std": 0.012084144151669675, |
| "accuracy_mean": 0.29483878477140407, |
| "accuracy_std": 0.002090313347697144, |
| "per_seed_gap_recovered": [ |
| 0.20655925520535176, |
| 0.18367281864429347, |
| 0.18193721360173717, |
| 0.19125222748723533, |
| 0.17480023564963507 |
| ], |
| "per_seed_accuracy": [ |
| 0.2980712127986996, |
| 0.29419602497144676, |
| 0.29408122587752067, |
| 0.29539983888056087, |
| 0.29244562132879237 |
| ] |
| } |
| }, |
| "8": { |
| "mean": { |
| "gap_recovered_mean": -0.13972029274907605, |
| "gap_recovered_std": 0.16140573821969564, |
| "accuracy_mean": 0.2354215987117811, |
| "accuracy_std": 0.029564716809042605, |
| "per_seed_gap_recovered": [ |
| -0.25, |
| -0.05490825505092227, |
| 0.10630679130554208, |
| -0.25, |
| -0.25 |
| ], |
| "per_seed_accuracy": [ |
| 0.21516666666666664, |
| 0.25132430619480967, |
| 0.280283687364096, |
| 0.21516666666666664, |
| 0.21516666666666664 |
| ] |
| }, |
| "global_ridge": { |
| "gap_recovered_mean": 0.20068106569092853, |
| "gap_recovered_std": 0.012220388493588764, |
| "accuracy_mean": 0.2971712960484384, |
| "accuracy_std": 0.002230078724212129, |
| "per_seed_gap_recovered": [ |
| 0.21065586188743862, |
| 0.18605573012911042, |
| 0.21070932108780432, |
| 0.20718265812972508, |
| 0.18880175722056428 |
| ], |
| "per_seed_accuracy": [ |
| 0.2990403014928445, |
| 0.29462198929128974, |
| 0.2990668709141085, |
| 0.2982323727278874, |
| 0.29489494581606196 |
| ] |
| }, |
| "topk8_global_ridge": { |
| "gap_recovered_mean": 0.20068368007396833, |
| "gap_recovered_std": 0.012222230487989805, |
| "accuracy_mean": 0.29717193931272656, |
| "accuracy_std": 0.0022303862873171943, |
| "per_seed_gap_recovered": [ |
| 0.21065314884843503, |
| 0.18605505186935956, |
| 0.21070738907518063, |
| 0.20720142331616637, |
| 0.18880138726070012 |
| ], |
| "per_seed_accuracy": [ |
| 0.29903937467487374, |
| 0.2946218563112719, |
| 0.2990664386091561, |
| 0.29823706614834145, |
| 0.29489496081998984 |
| ] |
| } |
| }, |
| "12": { |
| "mean": { |
| "gap_recovered_mean": -0.06185165931438573, |
| "gap_recovered_std": 0.11987185088549845, |
| "accuracy_mean": 0.24987132692446656, |
| "accuracy_std": 0.02200175109794565, |
| "per_seed_gap_recovered": [ |
| -0.11435040523265969, |
| 0.01908318338723024, |
| 0.020892102142860212, |
| 0.01511682313064057, |
| -0.25 |
| ], |
| "per_seed_accuracy": [ |
| 0.24058434674383583, |
| 0.2647677935293351, |
| 0.2649075207929502, |
| 0.263930306889545, |
| 0.21516666666666664 |
| ] |
| }, |
| "global_ridge": { |
| "gap_recovered_mean": 0.20759854234498124, |
| "gap_recovered_std": 0.008070049662145362, |
| "accuracy_mean": 0.2985715409284351, |
| "accuracy_std": 0.0013586871836284732, |
| "per_seed_gap_recovered": [ |
| 0.21448874884638292, |
| 0.19910567382286332, |
| 0.21295600512932095, |
| 0.2129719134034782, |
| 0.19847037052286085 |
| ], |
| "per_seed_accuracy": [ |
| 0.2998147268980399, |
| 0.2971767584373211, |
| 0.2994404271991774, |
| 0.29941309983976955, |
| 0.29701269226786736 |
| ] |
| }, |
| "topk8_global_ridge": { |
| "gap_recovered_mean": 0.20760713363515926, |
| "gap_recovered_std": 0.00804956171172338, |
| "accuracy_mean": 0.2985733282867519, |
| "accuracy_std": 0.0013561412938367808, |
| "per_seed_gap_recovered": [ |
| 0.21448340498167906, |
| 0.1991690602795832, |
| 0.21295518299628952, |
| 0.21295721777554216, |
| 0.19847080214270246 |
| ], |
| "per_seed_accuracy": [ |
| 0.2998131661469909, |
| 0.29718647974935075, |
| 0.2994436094020975, |
| 0.2994110638958284, |
| 0.29701232223949214 |
| ] |
| } |
| }, |
| "16": { |
| "mean": { |
| "gap_recovered_mean": -0.010073316508325977, |
| "gap_recovered_std": 0.08824797021532643, |
| "accuracy_mean": 0.25940458604933203, |
| "accuracy_std": 0.015888833442519255, |
| "per_seed_gap_recovered": [ |
| 0.04609660444588505, |
| 0.04871861276955448, |
| -0.03942812722304764, |
| 0.04715635447666568, |
| -0.15291002701068745 |
| ], |
| "per_seed_accuracy": [ |
| 0.2694970770704336, |
| 0.27004897209693646, |
| 0.25407284990398366, |
| 0.26969607279492525, |
| 0.23370795838038125 |
| ] |
| }, |
| "global_ridge": { |
| "gap_recovered_mean": 0.2123188520299978, |
| "gap_recovered_std": 0.0055871937881194125, |
| "accuracy_mean": 0.2994135150717593, |
| "accuracy_std": 0.0009899319263617858, |
| "per_seed_gap_recovered": [ |
| 0.21571068517093012, |
| 0.21426208676963024, |
| 0.21296079405422877, |
| 0.21608726320595584, |
| 0.20257343094924402 |
| ], |
| "per_seed_accuracy": [ |
| 0.30001587030805393, |
| 0.29979992151534424, |
| 0.299442303306755, |
| 0.3001066524297341, |
| 0.29770282779890916 |
| ] |
| }, |
| "topk8_global_ridge": { |
| "gap_recovered_mean": 0.21201921742537935, |
| "gap_recovered_std": 0.005505111192090935, |
| "accuracy_mean": 0.2993448407787016, |
| "accuracy_std": 0.0009667310823210812, |
| "per_seed_gap_recovered": [ |
| 0.21541710146542262, |
| 0.21387085421332014, |
| 0.2126474791559681, |
| 0.21573547248182634, |
| 0.20242517981035965 |
| ], |
| "per_seed_accuracy": [ |
| 0.29994774891316206, |
| 0.29971016041437787, |
| 0.2993640720734651, |
| 0.30002444234661674, |
| 0.2976777801458863 |
| ] |
| } |
| }, |
| "24": { |
| "mean": { |
| "gap_recovered_mean": 0.02425117328249178, |
| "gap_recovered_std": 0.0, |
| "accuracy_mean": 0.26560043321532767, |
| "accuracy_std": 0.0, |
| "per_seed_gap_recovered": [ |
| 0.02425117328249178, |
| 0.02425117328249178, |
| 0.02425117328249178, |
| 0.02425117328249178, |
| 0.02425117328249178 |
| ], |
| "per_seed_accuracy": [ |
| 0.26560043321532767, |
| 0.26560043321532767, |
| 0.26560043321532767, |
| 0.26560043321532767, |
| 0.26560043321532767 |
| ] |
| }, |
| "global_ridge": { |
| "gap_recovered_mean": 0.21777514342604026, |
| "gap_recovered_std": 0.0, |
| "accuracy_mean": 0.3004704752078001, |
| "accuracy_std": 0.0, |
| "per_seed_gap_recovered": [ |
| 0.21777514342604026, |
| 0.21777514342604026, |
| 0.21777514342604026, |
| 0.21777514342604026, |
| 0.21777514342604026 |
| ], |
| "per_seed_accuracy": [ |
| 0.3004704752078001, |
| 0.3004704752078001, |
| 0.3004704752078001, |
| 0.3004704752078001, |
| 0.3004704752078001 |
| ] |
| }, |
| "topk8_global_ridge": { |
| "gap_recovered_mean": 0.21633328651559772, |
| "gap_recovered_std": 0.0, |
| "accuracy_mean": 0.3001800095103253, |
| "accuracy_std": 0.0, |
| "per_seed_gap_recovered": [ |
| 0.21633328651559772, |
| 0.21633328651559772, |
| 0.21633328651559772, |
| 0.21633328651559772, |
| 0.21633328651559772 |
| ], |
| "per_seed_accuracy": [ |
| 0.3001800095103253, |
| 0.3001800095103253, |
| 0.3001800095103253, |
| 0.3001800095103253, |
| 0.3001800095103253 |
| ] |
| } |
| } |
| } |
| } |