{ "config": { "model_X": "Qwen/Qwen2.5-3B-Instruct", "model_Y": "meta-llama/Llama-3.2-3B-Instruct", "hub_repo": "CK0607/cross-model-lora-prediction-3b", "lora": { "r": 16, "alpha": 32, "targets": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], "dropout": 0.0 }, "train": { "examples": 1500, "eval_examples": 300, "round5_eval_examples": 100, "round5_fast_surrogate": true, "epochs": 3.0, "bs": 8, "lr": 0.0002, "max_len": 512 }, "r4_anchor_names": [ "gsm8k", "mbpp", "sciq", "arc_easy", "openbookqa", "svamp", "multiarith", "mmlu_high_school_biology", "math_counting_easy", "humaneval", "mmlu_high_school_physics", "mbpp_sanitized", "mmlu_elementary_math", "math_algebra_easy", "aqua_rat", "medmcqa_easy" ], "r5_requested_new_anchors": [ "aqua_rat_numeric", "math_counting_easy", "mawps", "mbpp_sanitized", "humaneval", "conala_curated", "medmcqa_easy", "pubmedqa_pqal" ], "r5_new_anchor_names": [ "aqua_rat_numeric", "math_counting_easy", "mawps", "mbpp_sanitized", "humaneval", "conala_curated", "medmcqa_easy", "pubmedqa_pqal" ], "dropped": [], "pool_anchor_names": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "heldouts": [ "gsm_hard", "gsm8k_test_500", "mbpp_test_held", "mbpp_plus", "openbookqa_test" ], "N_values": [ 4, 8, 12, 16, 24 ], "methods": [ "mean", "global_ridge", "topk8_global_ridge" ], "seeds": [ 11, 22, 33, 44, 55 ] }, "dataset_audit": { "tasks": { "aqua_rat_numeric": { "domain": "math", "kind": "math_num", "dataset": "deepmind/aqua_rat", "config": "raw", "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: From (1, 2, 3, 4, 5, 6), one number is picked out and replaced and one number is picked out again. If the sum of the 2 numbers is 9, what is the probability that the 2 numbers included the number 5?\n\nFinal answer:" }, { "role": "assistant", "content": "1/2" } ] }, "ok": true }, "math_counting_easy": { "domain": "math", "kind": "math_solution", "dataset": "EleutherAI/hendrycks_math", "config": "counting_and_probability", "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Alex has 10 different kinds of lunch meat and 9 different kinds of cheese. If he wants to make a sandwich with one kind of meat and two kinds of cheese, how many different sandwiches could he make? (It does not matter in which order he chooses the two types of cheese.)\n\nFinal answer:" }, { "role": "assistant", "content": "360" } ] }, "ok": true }, "mawps": { "domain": "math", "kind": "math_num", "dataset": "MU-NLPC/Calc-mawps", "config": null, "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Marilyn had 279 Beg. Daryl clasped some Beg. Now Marilyn has 59 Beg. How many did Daryl claspeds?\n\nFinal answer:" }, { "role": "assistant", "content": "220" } ] }, "ok": true }, "mbpp_sanitized": { "domain": "code", "kind": "code", "dataset": "google-research-datasets/mbpp", "config": "sanitized", "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to reverse words seperated by spaces in a given string.\n\nTests:\nassert reverse_words(\"python program\")==(\"program python\")\nassert reverse_words(\"java language\")==(\"language java\")\nassert reverse_words(\"indian man\")==(\"man indian\")\n\nCode:" }, { "role": "assistant", "content": "def reverse_words(s):\n return ' '.join(reversed(s.split()))" } ] }, "ok": true }, "humaneval": { "domain": "code", "kind": "code", "dataset": "openai/openai_humaneval", "config": null, "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Complete the following Python function. Respond with only the completion code.\n\ndef below_threshold(l: list, t: int):\n \"\"\"Return True if all numbers in the list l are below threshold t.\n >>> below_threshold([1, 2, 4, 10], 100)\n True\n >>> below_threshold([1, 20, 4, 10], 5)\n False\n \"\"\"" }, { "role": "assistant", "content": "for e in l:\n if e >= t:\n return False\n return True" } ] }, "ok": true }, "conala_curated": { "domain": "code", "kind": "code", "dataset": "codeparrot/conala-mined-curated", "config": null, "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [], "sample": { "messages": [ { "role": "user", "content": "Write a Python code snippet for this intent. Respond with only code.\n\nIntent: print all contacts in class all_contacts\n\nCode:" }, { "role": "assistant", "content": "all_contacts = []" } ] }, "ok": true }, "medmcqa_easy": { "domain": "science", "kind": "mcq", "dataset": "openlifescienceai/medmcqa", "config": null, "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [ "A", "B", "C", "D", "E", "F", "G", "H" ], "sample": { "messages": [ { "role": "user", "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nApolipoprotein A-I (Apo A-I) is found in which of the following lipid components?\nA. LDL\nB. HDL\nC. VLDL\nD. Chylomicron\nAnswer:" }, { "role": "assistant", "content": "B" } ] }, "ok": true }, "pubmedqa_pqal": { "domain": "science", "kind": "short", "dataset": "qiaojin/PubMedQA", "config": "pqa_labeled", "train_rows_sampled": 20, "eval_rows_sampled": 10, "labels": [ "yes", "no", "maybe" ], "sample": { "messages": [ { "role": "user", "content": "Answer the biomedical question with only one of: yes, no, maybe.\n\nContext: To assess whether eligibility to an adjuvant chemotherapy protocol in itself represents a good prognostic factor after radical cystectomy for bladder cancer. Between April 1984 and May 1989, our institution entered 35 patients with invasive bladder cancer into the Swiss Group for Clinical and Epidemiological Cancer Research (SAKK) study 09/84. They were randomly assigned to either observation or three postoperative courses of cisplatin monotherapy after cystectomy. This study had a negative result. The outcome of these 35 patients (protocol group) was compared with an age- and tumor-stage-matched cohort (matched group; n = 35) who also underwent cystectomy during the same period, but were not entered into the SAKK study, as well as the remaining 57 patients treated during the study period for the same indication (remaining group). Median overall survival decreased from 76.3 months in the protocol group to 52.1 months in the matched group and to 20.3 months in the remaining group. The respective times of median recurrence-free survival were 67.2, 16.0, and 9.4 months. Tumor progression occurred in 46% of the protocol group compared with 69% in the matched group and 65% in the remaining group (P<.05). Cancer-related death was noted in 40% of the protocol group, 57% in the matched group, and 56% in the remaining group.\n\nQuestion: Is eligibility for a chemotherapy protocol a good prognostic factor for invasive bladder cancer after radical cystectomy?\n\nAnswer:" }, { "role": "assistant", "content": "yes" } ] }, "ok": true } }, "dropped": [], "kept": [ "aqua_rat_numeric", "math_counting_easy", "mawps", "mbpp_sanitized", "humaneval", "conala_curated", "medmcqa_easy", "pubmedqa_pqal" ], "anchor_domain_counts": { "math": 3, "code": 3, "science": 2 } }, "training": [ { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "aqua_rat_numeric", "ok": true, "gpu": 0 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "humaneval", "ok": true, "gpu": 0 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "aqua_rat_numeric", "ok": true, "gpu": 1 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "humaneval", "ok": true, "gpu": 1 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "math_counting_easy", "ok": true, "gpu": 2 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "conala_curated", "ok": true, "gpu": 2 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "math_counting_easy", "ok": true, "gpu": 3 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "conala_curated", "ok": true, "gpu": 3 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "mawps", "ok": true, "gpu": 4 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "medmcqa_easy", "ok": true, "gpu": 4 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "mawps", "ok": true, "gpu": 5 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "medmcqa_easy", "ok": true, "gpu": 5 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "mbpp_sanitized", "ok": true, "gpu": 6 }, { "side": "X", "model": "Qwen/Qwen2.5-3B-Instruct", "task": "pubmedqa_pqal", "ok": true, "gpu": 6 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "mbpp_sanitized", "ok": true, "gpu": 7 }, { "side": "Y", "model": "meta-llama/Llama-3.2-3B-Instruct", "task": "pubmedqa_pqal", "ok": true, "gpu": 7 } ], "baselines": { "gsm_hard": { "base_Y": 0.06333333333333334, "oracle": 0.15 }, "gsm8k_test_500": { "base_Y": 0.08, "oracle": 0.29333333333333333 }, "mbpp_test_held": { "base_Y": 0.23, "oracle": 0.32 }, "mbpp_plus": { "base_Y": 0.21666666666666667, "oracle": 0.45 }, "openbookqa_test": { "base_Y": 0.71, "oracle": 0.9833333333333333 } }, "records": [ { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 11, "method": "mean", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.06235950804304804, "gap_recovered": -0.011236445657138083 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 11, "method": "global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07221313142228401, "gap_recovered": 0.10245920871866164 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r5:medmcqa_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0722087583596679, "gap_recovered": 0.10240875030386039 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 11, "method": "mean", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.09861895769491964, "gap_recovered": 0.08727636419493583 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 11, "method": "global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12459708992092089, "gap_recovered": 0.20904885900431666 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r5:medmcqa_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12459502910745557, "gap_recovered": 0.209039198941198 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 11, "method": "mean", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.24167782528647064, "gap_recovered": 0.12975361429411808 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 11, "method": "global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25468766738628523, "gap_recovered": 0.27430741540316916 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r5:medmcqa_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25469019236235785, "gap_recovered": 0.2743354706928649 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 11, "method": "mean", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2398360331853231, "gap_recovered": 0.09929728507995608 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 11, "method": "global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.274934759222228, "gap_recovered": 0.24972039666669132 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r5:medmcqa_easy" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749305629182136, "gap_recovered": 0.2497024125066297 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 11, "method": "mean", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.733175716372742, "gap_recovered": 0.08478920624173929 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 11, "method": "global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7639258190681194, "gap_recovered": 0.19728958195653454 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:multiarith", "r4:mbpp_sanitized", "r5:aqua_rat_numeric", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r5:medmcqa_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7639315212458029, "gap_recovered": 0.1973104435822058 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 22, "method": "mean", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.06060013856010876, "gap_recovered": -0.03153686276797588 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 22, "method": "global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0706267873720191, "gap_recovered": 0.08415523890791271 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07063988874698508, "gap_recovered": 0.0843064086190586 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 22, "method": "mean", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.09357568675074085, "gap_recovered": 0.06363603164409773 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 22, "method": "global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1202605464540679, "gap_recovered": 0.18872131150344332 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1203002280750494, "gap_recovered": 0.1889073191017941 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 22, "method": "mean", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23954079837634648, "gap_recovered": 0.10600887084829412 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 22, "method": "global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25191722294379926, "gap_recovered": 0.2435246993755472 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25194909498609347, "gap_recovered": 0.24387883317881623 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 22, "method": "mean", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.23348182179461952, "gap_recovered": 0.07206495054836931 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 22, "method": "global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26477579495002485, "gap_recovered": 0.2061819783572493 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2648303948599717, "gap_recovered": 0.20641597797130715 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 22, "method": "mean", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7315477586888719, "gap_recovered": 0.07883326349587293 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 22, "method": "global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7632082435454445, "gap_recovered": 0.19466430565406528 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7632605181891342, "gap_recovered": 0.19485555435049115 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07145326948713983, "gap_recovered": 0.09369157100545951 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": [ "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:mbpp" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07145337636443391, "gap_recovered": 0.09369280420500663 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12266466074976429, "gap_recovered": 0.1999905972645201 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": [ "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:mbpp" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12266483613814431, "gap_recovered": 0.19999141939755147 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2502199395771684, "gap_recovered": 0.2246659953018713 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": [ "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:mbpp" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25022055001094423, "gap_recovered": 0.22467277789938025 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26078072169731403, "gap_recovered": 0.18906023584563153 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": [ "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:mbpp" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2607817048313974, "gap_recovered": 0.1890644492774174 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7652856901322288, "gap_recovered": 0.20226471999595919 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:mmlu_elementary_math", "r4:math_algebra_easy" ], "selected_topk": [ "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:mbpp" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7652856620426836, "gap_recovered": 0.20226461722933023 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 44, "method": "mean", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.06258799388490875, "gap_recovered": -0.008600070558745234 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 44, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07188610471528153, "gap_recovered": 0.09868582363786381 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": [ "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07188549907728174, "gap_recovered": 0.09867883550709697 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 44, "method": "mean", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.09898812831133263, "gap_recovered": 0.08900685145937172 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 44, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12376669171212734, "gap_recovered": 0.20515636740059692 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": [ "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12377916621065689, "gap_recovered": 0.20521484161245418 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 44, "method": "mean", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.24100257454247312, "gap_recovered": 0.12225082824970122 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 44, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25219830919956343, "gap_recovered": 0.24664787999514912 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": [ "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2522135422969687, "gap_recovered": 0.2468171366329853 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 44, "method": "mean", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.23734664177072462, "gap_recovered": 0.0886284647316769 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 44, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2656234004031653, "gap_recovered": 0.20981457315642263 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": [ "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26564392632451556, "gap_recovered": 0.2099025413907809 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 44, "method": "mean", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7332305190755033, "gap_recovered": 0.08498970393476829 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 44, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7634382407418613, "gap_recovered": 0.19550575881168777 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "selected_topk": [ "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7634770604933815, "gap_recovered": 0.19564778229285923 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.06984950443793989, "gap_recovered": 0.07518658966853708 }, { "task": "gsm_hard", "domain": "math", "N": 4, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r5:math_counting_easy", "r4:sciq" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.06985002991796911, "gap_recovered": 0.07519265289964354 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.11843885312135194, "gap_recovered": 0.1801821240063372 }, { "task": "gsm8k_test_500", "domain": "math", "N": 4, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:sciq", "r5:math_counting_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.11843983968098959, "gap_recovered": 0.1801867485046387 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2514841386778601, "gap_recovered": 0.23871265197622352 }, { "task": "mbpp_test_held", "domain": "code", "N": 4, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r5:math_counting_easy", "r4:sciq" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2514852855534389, "gap_recovered": 0.23872539503821 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26363710987156835, "gap_recovered": 0.20130189944957863 }, { "task": "mbpp_plus", "domain": "code", "N": 4, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r5:math_counting_easy", "r4:sciq" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26363701395604805, "gap_recovered": 0.20130148838306303 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7588161622518781, "gap_recovered": 0.17859571555565168 }, { "task": "openbookqa_test", "domain": "science", "N": 4, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:sciq", "r5:math_counting_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7588159375355161, "gap_recovered": 0.1785948934226201 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07245608132461023, "gap_recovered": 0.10526247682242572 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r4:mmlu_elementary_math", "r5:medmcqa_easy", "r4:mbpp", "r5:mbpp_sanitized", "r4:gsm8k" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0724562861727572, "gap_recovered": 0.10526484045489079 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12521504895440466, "gap_recovered": 0.21194554197377186 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r4:mmlu_elementary_math", "r5:medmcqa_easy", "r4:mbpp", "r5:mbpp_sanitized", "r5:aqua_rat_numeric" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1252146762540971, "gap_recovered": 0.21194379494108018 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25471247319517465, "gap_recovered": 0.27458303550194046 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r4:mmlu_elementary_math", "r5:medmcqa_easy", "r4:mbpp", "r5:mbpp_sanitized", "r4:gsm8k" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25471275066507276, "gap_recovered": 0.2745861185008084 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493516686318936, "gap_recovered": 0.24972214369938295 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:multiarith", "r4:mmlu_elementary_math", "r5:medmcqa_easy", "r4:mbpp", "r5:mbpp_sanitized", "r4:gsm8k" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749353107364699, "gap_recovered": 0.24972276029915658 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7678827371268436, "gap_recovered": 0.21176611143967194 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:multiarith", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:multiarith", "r5:medmcqa_easy", "r4:mbpp", "r5:mbpp_sanitized", "r4:gsm8k" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.767877849545972, "gap_recovered": 0.21174823004623933 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 22, "method": "mean", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.05140127220373045, "gap_recovered": -0.13767762841849493 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 22, "method": "global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0708979350671001, "gap_recovered": 0.08728386615884724 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy", "r5:mbpp_sanitized", "r5:aqua_rat_numeric" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0708976678738649, "gap_recovered": 0.0872807831599795 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 22, "method": "mean", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.06954419015467853, "gap_recovered": -0.04901160864994442 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 22, "method": "global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12091904212688578, "gap_recovered": 0.19180800996977712 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy", "r5:mbpp_sanitized", "r5:aqua_rat_numeric" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12091888866205326, "gap_recovered": 0.19180729060337467 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 22, "method": "mean", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.22929394602775574, "gap_recovered": -0.007845044136047438 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 22, "method": "global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25210894539438444, "gap_recovered": 0.24565494882649372 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy", "r5:mbpp_sanitized", "r5:mawps" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2521090841293335, "gap_recovered": 0.24565649032592798 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 22, "method": "mean", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2074264666129803, "gap_recovered": -0.03960085737294156 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 22, "method": "global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2652467161759563, "gap_recovered": 0.20820021218266982 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy", "r5:mbpp_sanitized", "r5:mawps" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2652471477957978, "gap_recovered": 0.20820206198199043 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 22, "method": "mean", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6989556559749033, "gap_recovered": -0.040406136677183 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 22, "method": "global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7639373076921222, "gap_recovered": 0.1973316135077643 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy", "r5:mbpp_sanitized", "r5:mawps" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7639364930953102, "gap_recovered": 0.19732863327552524 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0646524178844759, "gap_recovered": 0.015220206359337235 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07239382530080862, "gap_recovered": 0.10454413808625329 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:mbpp" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07239352248180872, "gap_recovered": 0.1045406440208698 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.10447578956340922, "gap_recovered": 0.11473026357848072 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12504286141231144, "gap_recovered": 0.21113841287020987 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:mbpp" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1250425544826464, "gap_recovered": 0.21113697413740498 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.24428923323236665, "gap_recovered": 0.1587692581374071 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547727211590471, "gap_recovered": 0.27525245732274534 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:mbpp" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477285064499955, "gap_recovered": 0.27525389605555045 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.24603687565902185, "gap_recovered": 0.12587232425295075 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493519084206947, "gap_recovered": 0.24972224646601196 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:mbpp" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493516686318936, "gap_recovered": 0.24972214369938295 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7419641204812061, "gap_recovered": 0.11694190419953457 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7681897558563057, "gap_recovered": 0.21288935069380135 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:humaneval", "r4:math_algebra_easy", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:mbpp" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7681880985731366, "gap_recovered": 0.21288328746269514 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07217366698144498, "gap_recovered": 0.10200384978590354 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r4:sciq", "r4:gsm8k" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07217657938770865, "gap_recovered": 0.10203745447356127 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12450159094799526, "gap_recovered": 0.2086012075687278 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r4:sciq", "r4:arc_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12450937380735902, "gap_recovered": 0.20863768972199542 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25476326868451876, "gap_recovered": 0.2751474298279861 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r4:arc_easy", "r4:sciq" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547594395999251, "gap_recovered": 0.275104884443612 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749258150999574, "gap_recovered": 0.2496820647141031 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r4:arc_easy", "r4:sciq" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749346633067076, "gap_recovered": 0.2497199856001755 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7647975219255206, "gap_recovered": 0.20047873875190492 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:aqua_rat", "r4:sciq", "r5:mbpp_sanitized", "r4:gsm8k" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7648052746400066, "gap_recovered": 0.20050710234148764 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07169886460249453, "gap_recovered": 0.09652536079801373 }, { "task": "gsm_hard", "domain": "math", "N": 8, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r5:mbpp_sanitized", "r5:math_counting_easy", "r4:sciq", "r5:aqua_rat_numeric" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07169961274355308, "gap_recovered": 0.09653399319484314 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1234050847196031, "gap_recovered": 0.20346133462313953 }, { "task": "gsm8k_test_500", "domain": "math", "N": 8, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r5:mbpp_sanitized", "r4:sciq", "r5:math_counting_easy", "r5:aqua_rat_numeric" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1234069701446884, "gap_recovered": 0.2034701725532269 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2520854159470262, "gap_recovered": 0.2453935105225134 }, { "task": "mbpp_test_held", "domain": "code", "N": 8, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r5:mbpp_sanitized", "r4:arc_easy", "r5:math_counting_easy", "r4:sciq" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2520842228264644, "gap_recovered": 0.24538025362738236 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2653153916885113, "gap_recovered": 0.2084945358079056 }, { "task": "mbpp_plus", "domain": "code", "N": 8, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r5:math_counting_easy", "r5:mbpp_sanitized", "r4:arc_easy", "r4:sciq" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2653136891880255, "gap_recovered": 0.2084872393772521 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 55, "method": "mean", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 55, "method": "global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7619699721226747, "gap_recovered": 0.19013404435124903 }, { "task": "openbookqa_test", "domain": "science", "N": 8, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:sciq", "r4:arc_easy", "r4:humaneval", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:sciq", "r5:mbpp_sanitized", "r5:math_counting_easy", "r5:aqua_rat_numeric" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7619703091972175, "gap_recovered": 0.19013527755079598 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.046795635689264065, "gap_recovered": -0.19081958820079933 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07289376165675021, "gap_recovered": 0.11031263450096393 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:mmlu_elementary_math", "r4:mmlu_high_school_biology", "r4:svamp", "r5:medmcqa_easy", "r4:mbpp" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07289361024725027, "gap_recovered": 0.11031088746827233 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.05766524479306978, "gap_recovered": -0.10469416503248542 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12629289824387124, "gap_recovered": 0.21699796051814646 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:mmlu_elementary_math", "r4:mmlu_high_school_biology", "r4:svamp", "r5:medmcqa_easy", "r4:mbpp" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1262937751857714, "gap_recovered": 0.21700207118330342 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.22348956247855878, "gap_recovered": -0.07233819468268035 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477145404651247, "gap_recovered": 0.27523837829458286 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math", "r4:svamp", "r5:medmcqa_easy", "r4:mbpp" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547717407654072, "gap_recovered": 0.27524156406008 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.19311567914897, "gap_recovered": -0.10093280364727152 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749361260183927, "gap_recovered": 0.24972625436454002 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math", "r4:svamp", "r5:medmcqa_easy", "r4:mbpp" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749361020395126, "gap_recovered": 0.249726151597911 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6818556116093164, "gap_recovered": -0.10296727460006182 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7701793945246729, "gap_recovered": 0.22016851655368141 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:humaneval", "r4:multiarith", "r4:svamp", "r5:medmcqa_easy", "r4:mbpp" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7701706024970131, "gap_recovered": 0.2201363505988285 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.057627453502567344, "gap_recovered": -0.06583707497037687 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07227199409199858, "gap_recovered": 0.10313839336921435 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07227138845399879, "gap_recovered": 0.10313140523844752 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08585960651266165, "gap_recovered": 0.0274669055281015 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12446026506095098, "gap_recovered": 0.20840749247320772 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12446679827810705, "gap_recovered": 0.20843811692862682 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23572552253460063, "gap_recovered": 0.06361691705111805 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25255285098634916, "gap_recovered": 0.25058723318165727 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25256959167020077, "gap_recovered": 0.2507732407800084 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2237824846958292, "gap_recovered": 0.030496362982125127 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26642122570125537, "gap_recovered": 0.21323382443395156 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.266440984298443, "gap_recovered": 0.2133185041361842 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208439004010168, "gap_recovered": 0.039672806345183394 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7701774563460514, "gap_recovered": 0.22016142565628571 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_biology", "r4:humaneval", "r5:humaneval", "r4:svamp", "r4:openbookqa", "r5:pubmedqa_pqal", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.770183636046004, "gap_recovered": 0.22018403431464906 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.05790650121096908, "gap_recovered": -0.06261729371958759 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07281855566748258, "gap_recovered": 0.10944487308633737 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:medmcqa_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07281329196074915, "gap_recovered": 0.10938413800864398 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08668351535139414, "gap_recovered": 0.03132897820966002 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1260294868206156, "gap_recovered": 0.21576321947163563 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:medmcqa_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12603113108667835, "gap_recovered": 0.21577092696880476 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23607687341755837, "gap_recovered": 0.06752081575064844 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.254778455536941, "gap_recovered": 0.27531617263267777 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:medmcqa_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547751813921435, "gap_recovered": 0.2752797932460388 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22509549623248223, "gap_recovered": 0.036123555282066684 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27492768545260377, "gap_recovered": 0.24969008051115896 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:medmcqa_easy" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493065883373397, "gap_recovered": 0.24970282357314552 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.718775217752347, "gap_recovered": 0.03210445519151349 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7686479525182439, "gap_recovered": 0.21456567994479495 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:mbpp_sanitized", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:humaneval", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat", "r4:medmcqa_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7686677837371826, "gap_recovered": 0.21463823318481448 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.05730614473079814, "gap_recovered": -0.06954448387540614 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07286305224758456, "gap_recovered": 0.1099582951644372 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy", "r5:conala_curated" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07285891075243896, "gap_recovered": 0.10991050868198791 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.0851024329525301, "gap_recovered": 0.023917654464984846 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.126309691681259, "gap_recovered": 0.21707667975590156 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:aqua_rat", "r4:openbookqa", "r5:conala_curated", "r5:medmcqa_easy" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12630539466594828, "gap_recovered": 0.2170565374966326 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23555245531016383, "gap_recovered": 0.061693947890709144 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25476923428732773, "gap_recovered": 0.27521371430364133 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:aqua_rat", "r5:conala_curated", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547697984761205, "gap_recovered": 0.2752199830680057 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22392417589823407, "gap_recovered": 0.03110361099243171 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.274942360527214, "gap_recovered": 0.24975297368805993 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:aqua_rat", "r5:conala_curated", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493696527919553, "gap_recovered": 0.24972985119655225 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7177663255559987, "gap_recovered": 0.02841338618048329 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7681811604554626, "gap_recovered": 0.21285790410535108 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mbpp_sanitized", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:openbookqa", "r4:aqua_rat", "r5:medmcqa_easy", "r5:conala_curated" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7681842503054388, "gap_recovered": 0.21286920843453233 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.04166666666666667, "gap_recovered": -0.25000000000000006 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07221353221213682, "gap_recovered": 0.10246383321696324 }, { "task": "gsm_hard", "domain": "math", "N": 12, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:openbookqa", "r5:conala_curated", "r5:mbpp_sanitized", "r5:math_counting_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0722140398792837, "gap_recovered": 0.10246969091481181 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.026666666666666672, "gap_recovered": -0.25 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12460340390260194, "gap_recovered": 0.20907845579344658 }, { "task": "gsm8k_test_500", "domain": "math", "N": 12, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:openbookqa", "r5:conala_curated", "r5:mbpp_sanitized", "r4:sciq" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1246049604744747, "gap_recovered": 0.20908575222410014 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.20750000000000002, "gap_recovered": -0.24999999999999992 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2525556534323199, "gap_recovered": 0.2506183714702209 }, { "task": "mbpp_test_held", "domain": "code", "N": 12, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:conala_curated", "r4:openbookqa", "r5:mbpp_sanitized", "r4:arc_easy" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25255594015121463, "gap_recovered": 0.250621557235718 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.15833333333333333, "gap_recovered": -0.25000000000000006 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26660809311373485, "gap_recovered": 0.2140346847731493 }, { "task": "mbpp_plus", "domain": "code", "N": 12, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:conala_curated", "r4:openbookqa", "r5:math_counting_easy", "r5:mbpp_sanitized" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2666099874452613, "gap_recovered": 0.2140428033368342 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6416666666666666, "gap_recovered": -0.2500000000000001 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7690827786785432, "gap_recovered": 0.21615650736052422 }, { "task": "openbookqa_test", "domain": "science", "N": 12, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated" ], "selected_topk": [ "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:openbookqa", "r5:conala_curated", "r4:sciq", "r5:mbpp_sanitized" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7690766832472264, "gap_recovered": 0.2161342070020481 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.059948044563161926, "gap_recovered": -0.03906102427120862 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07311635143455418, "gap_recovered": 0.11288097809100972 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:mmlu_high_school_biology", "r4:svamp" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07309355094515045, "gap_recovered": 0.1126178955209667 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.0921708915973532, "gap_recovered": 0.05705105436259312 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12692907574533047, "gap_recovered": 0.21998004255623657 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:mmlu_high_school_biology", "r4:svamp" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12690070667486084, "gap_recovered": 0.2198470625384102 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23833444250041042, "gap_recovered": 0.09260491667122672 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547743489824493, "gap_recovered": 0.27527054424943626 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2547730911189112, "gap_recovered": 0.2752565679879024 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2306903924065075, "gap_recovered": 0.060101681742174916 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.274936221933913, "gap_recovered": 0.2497266654310556 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493053893933356, "gap_recovered": 0.24970230974000093 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7263416142847346, "gap_recovered": 0.05978639372463909 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7703233534440227, "gap_recovered": 0.22069519552691244 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:arc_easy", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:svamp" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7700408568875543, "gap_recovered": 0.21966167153983296 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.05999645997738017, "gap_recovered": -0.038502384876382696 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0728135057153373, "gap_recovered": 0.10938660440773808 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:math_algebra_easy", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07278109517590754, "gap_recovered": 0.10901263664508698 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.09217455282978629, "gap_recovered": 0.05706821638962322 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12601758233432114, "gap_recovered": 0.21570741719213035 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:math_algebra_easy", "r4:mmlu_high_school_biology", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12597274867967628, "gap_recovered": 0.21549725943598258 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23862580439140058, "gap_recovered": 0.09584227101556184 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477864976586967, "gap_recovered": 0.2753183307318851 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477734565734866, "gap_recovered": 0.27530384063720725 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.23142021360068488, "gap_recovered": 0.06322948686007801 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493547858863043, "gap_recovered": 0.24972347966555897 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:mmlu_high_school_biology", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749307787281344, "gap_recovered": 0.24970333740629014 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 22, "method": "mean", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7280278296854304, "gap_recovered": 0.06595547445889202 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 22, "method": "global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7704543911725625, "gap_recovered": 0.22117460185083868 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:mbpp", "r4:openbookqa", "r4:svamp", "r4:mmlu_high_school_biology", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:math_algebra_easy", "r4:svamp", "r4:openbookqa", "r5:pubmedqa_pqal" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7700888338308225, "gap_recovered": 0.21983719694203366 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.052866684595743826, "gap_recovered": -0.12076902389526362 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.0728197847563645, "gap_recovered": 0.10945905488112877 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07280703073260429, "gap_recovered": 0.10931189306851101 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.07347902846062322, "gap_recovered": -0.030567054090828657 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12603295074112114, "gap_recovered": 0.21577945659900535 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1260124741477528, "gap_recovered": 0.21568347256759124 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23054410515160398, "gap_recovered": 0.006045612795599651 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477470044432016, "gap_recovered": 0.275274449381335 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25477317435988067, "gap_recovered": 0.2752574928875629 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.21094915592807464, "gap_recovered": -0.024503617451108697 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749346393278276, "gap_recovered": 0.24971988283354674 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27493053893933356, "gap_recovered": 0.24970230974000093 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7025252753838725, "gap_recovered": -0.027346553473636886 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7686494412641416, "gap_recovered": 0.2145711265761279 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:svamp", "r4:humaneval", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:math_algebra_easy", "r4:svamp", "r5:pubmedqa_pqal", "r4:aqua_rat" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7682971421877542, "gap_recovered": 0.2132822275161742 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.05988999237959414, "gap_recovered": -0.039730857158529254 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07309392501567973, "gap_recovered": 0.1126222117193814 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_high_school_biology", "r4:aqua_rat" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306783804948304, "gap_recovered": 0.11232120826326579 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.09205213174052623, "gap_recovered": 0.05649436753371671 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716354808588137, "gap_recovered": 0.22107913165256893 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_high_school_biology", "r4:aqua_rat" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12713919102460491, "gap_recovered": 0.22096495792783555 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.23857803332394567, "gap_recovered": 0.09531148137717402 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25479441930507796, "gap_recovered": 0.27549354783419944 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:humaneval", "r5:humaneval", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25479235677883544, "gap_recovered": 0.2754706308759492 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2313646065777746, "gap_recovered": 0.06299117104760542 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749491705291573, "gap_recovered": 0.24978215941067408 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:humaneval", "r5:humaneval", "r4:math_counting_easy", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_algebra_easy", "r4:aqua_rat" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749346393278276, "gap_recovered": 0.24971988283354674 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7265955999527854, "gap_recovered": 0.06071560958336151 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7705321992128744, "gap_recovered": 0.22145926541295533 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mbpp_sanitized", "r4:math_algebra_easy", "r4:aqua_rat", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:openbookqa" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7701881865523327, "gap_recovered": 0.22020068250853433 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.043656596123487115, "gap_recovered": -0.2270392754982257 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07285970342570339, "gap_recovered": 0.10991965491196211 }, { "task": "gsm_hard", "domain": "math", "N": 16, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:aqua_rat", "r4:openbookqa", "r5:medmcqa_easy" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07284248727491534, "gap_recovered": 0.10972100701825385 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.04926707914505883, "gap_recovered": -0.14406056650753674 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12633001480979483, "gap_recovered": 0.21717194442091328 }, { "task": "gsm8k_test_500", "domain": "math", "N": 16, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math", "r4:aqua_rat", "r4:openbookqa", "r5:conala_curated" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.126309735528354, "gap_recovered": 0.21707688528915942 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2197829900938889, "gap_recovered": -0.11352233229012336 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.25266901838368383, "gap_recovered": 0.25187798204093137 }, { "task": "mbpp_test_held", "domain": "code", "N": 16, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:conala_curated", "r4:openbookqa" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2526553668646977, "gap_recovered": 0.25172629849664097 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.18353273553409796, "gap_recovered": -0.14200256199672304 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.26689128368750387, "gap_recovered": 0.21524835866073083 }, { "task": "mbpp_plus", "domain": "code", "N": 16, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:conala_curated", "r4:openbookqa" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2668517425142486, "gap_recovered": 0.2150788964896368 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.6723003910053735, "gap_recovered": -0.13792539876082843 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7697641186878599, "gap_recovered": 0.2186492147116826 }, { "task": "openbookqa_test", "domain": "science", "N": 16, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:multiarith", "r4:humaneval", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy" ], "selected_topk": [ "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:openbookqa", "r4:aqua_rat", "r5:medmcqa_easy" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.769729568547216, "gap_recovered": 0.21852281175810734 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.058087711663081736, "gap_recovered": -0.0605264038875185 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07326510681503122, "gap_recovered": 0.11459738632728325 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306991325027642, "gap_recovered": 0.11234515288780485 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08727496881594604, "gap_recovered": 0.034101416324747065 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1274962817663434, "gap_recovered": 0.22263882077973468 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716514850484914, "gap_recovered": 0.22108663361648034 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2363231649481017, "gap_recovered": 0.07025738831224113 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548052961250832, "gap_recovered": 0.275614401389813 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548048059282632, "gap_recovered": 0.27560895475848 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22549109979607596, "gap_recovered": 0.037818999126039775 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27496794599226154, "gap_recovered": 0.24986262568112086 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_counting_easy", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749465808101084, "gap_recovered": 0.24977106061475035 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 11, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208252208534328, "gap_recovered": 0.03960446653694945 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 11, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7718177453402815, "gap_recovered": 0.22616248295224942 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 11, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_physics", "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7709135990581293, "gap_recovered": 0.22285463070047312 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 22, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.058087711663081736, "gap_recovered": -0.0605264038875185 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 22, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07326510681503122, "gap_recovered": 0.11459738632728325 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306991325027642, "gap_recovered": 0.11234515288780485 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 22, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08727496881594604, "gap_recovered": 0.034101416324747065 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 22, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1274962817663434, "gap_recovered": 0.22263882077973468 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716514850484914, "gap_recovered": 0.22108663361648034 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 22, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2363231649481017, "gap_recovered": 0.07025738831224113 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 22, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548052961250832, "gap_recovered": 0.275614401389813 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548048059282632, "gap_recovered": 0.27560895475848 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 22, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22549109979607596, "gap_recovered": 0.037818999126039775 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 22, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27496794599226154, "gap_recovered": 0.24986262568112086 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_counting_easy", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749465808101084, "gap_recovered": 0.24977106061475035 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 22, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208252208534328, "gap_recovered": 0.03960446653694945 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 22, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7718177453402815, "gap_recovered": 0.22616248295224942 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 22, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_physics", "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7709135990581293, "gap_recovered": 0.22285463070047312 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.058087711663081736, "gap_recovered": -0.0605264038875185 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07326510681503122, "gap_recovered": 0.11459738632728325 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306991325027642, "gap_recovered": 0.11234515288780485 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08727496881594604, "gap_recovered": 0.034101416324747065 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1274962817663434, "gap_recovered": 0.22263882077973468 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716514850484914, "gap_recovered": 0.22108663361648034 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2363231649481017, "gap_recovered": 0.07025738831224113 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548052961250832, "gap_recovered": 0.275614401389813 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548048059282632, "gap_recovered": 0.27560895475848 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22549109979607596, "gap_recovered": 0.037818999126039775 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27496794599226154, "gap_recovered": 0.24986262568112086 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_counting_easy", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749465808101084, "gap_recovered": 0.24977106061475035 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 33, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208252208534328, "gap_recovered": 0.03960446653694945 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 33, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7718177453402815, "gap_recovered": 0.22616248295224942 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 33, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_physics", "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7709135990581293, "gap_recovered": 0.22285463070047312 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.058087711663081736, "gap_recovered": -0.0605264038875185 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07326510681503122, "gap_recovered": 0.11459738632728325 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306991325027642, "gap_recovered": 0.11234515288780485 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08727496881594604, "gap_recovered": 0.034101416324747065 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1274962817663434, "gap_recovered": 0.22263882077973468 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716514850484914, "gap_recovered": 0.22108663361648034 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2363231649481017, "gap_recovered": 0.07025738831224113 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548052961250832, "gap_recovered": 0.275614401389813 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548048059282632, "gap_recovered": 0.27560895475848 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22549109979607596, "gap_recovered": 0.037818999126039775 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27496794599226154, "gap_recovered": 0.24986262568112086 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_counting_easy", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749465808101084, "gap_recovered": 0.24977106061475035 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 44, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208252208534328, "gap_recovered": 0.03960446653694945 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 44, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7718177453402815, "gap_recovered": 0.22616248295224942 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 44, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_physics", "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7709135990581293, "gap_recovered": 0.22285463070047312 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.058087711663081736, "gap_recovered": -0.0605264038875185 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07326510681503122, "gap_recovered": 0.11459738632728325 }, { "task": "gsm_hard", "domain": "math", "N": 24, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.06333333333333334, "oracle": 0.15, "accuracy": 0.07306991325027642, "gap_recovered": 0.11234515288780485 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.08727496881594604, "gap_recovered": 0.034101416324747065 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.1274962817663434, "gap_recovered": 0.22263882077973468 }, { "task": "gsm8k_test_500", "domain": "math", "N": 24, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:math_counting_easy", "r4:mbpp_sanitized", "r4:mmlu_high_school_physics", "r5:humaneval", "r4:humaneval", "r4:multiarith", "r4:math_algebra_easy", "r4:mmlu_elementary_math" ], "base_Y": 0.08, "oracle": 0.29333333333333333, "accuracy": 0.12716514850484914, "gap_recovered": 0.22108663361648034 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2363231649481017, "gap_recovered": 0.07025738831224113 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548052961250832, "gap_recovered": 0.275614401389813 }, { "task": "mbpp_test_held", "domain": "code", "N": 24, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r4:math_counting_easy", "r5:humaneval", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.23, "oracle": 0.32, "accuracy": 0.2548048059282632, "gap_recovered": 0.27560895475848 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.22549109979607596, "gap_recovered": 0.037818999126039775 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.27496794599226154, "gap_recovered": 0.24986262568112086 }, { "task": "mbpp_plus", "domain": "code", "N": 24, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mbpp_sanitized", "r5:humaneval", "r4:humaneval", "r4:math_counting_easy", "r4:mmlu_high_school_physics", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:mmlu_elementary_math" ], "base_Y": 0.21666666666666667, "oracle": 0.45, "accuracy": 0.2749465808101084, "gap_recovered": 0.24977106061475035 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 55, "method": "mean", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7208252208534328, "gap_recovered": 0.03960446653694945 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 55, "method": "global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": null, "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7718177453402815, "gap_recovered": 0.22616248295224942 }, { "task": "openbookqa_test", "domain": "science", "N": 24, "seed": 55, "method": "topk8_global_ridge", "anchors": [ "r4:gsm8k", "r4:mbpp", "r4:sciq", "r4:arc_easy", "r4:openbookqa", "r4:svamp", "r4:multiarith", "r4:mmlu_high_school_biology", "r4:math_counting_easy", "r4:humaneval", "r4:mmlu_high_school_physics", "r4:mbpp_sanitized", "r4:mmlu_elementary_math", "r4:math_algebra_easy", "r4:aqua_rat", "r4:medmcqa_easy", "r5:aqua_rat_numeric", "r5:math_counting_easy", "r5:mawps", "r5:mbpp_sanitized", "r5:humaneval", "r5:conala_curated", "r5:medmcqa_easy", "r5:pubmedqa_pqal" ], "selected_topk": [ "r4:mmlu_high_school_physics", "r4:mmlu_high_school_biology", "r4:mbpp_sanitized", "r4:math_counting_easy", "r4:mmlu_elementary_math", "r5:humaneval", "r4:humaneval", "r4:multiarith" ], "base_Y": 0.71, "oracle": 0.9833333333333333, "accuracy": 0.7709135990581293, "gap_recovered": 0.22285463070047312 } ], "summary": { "4": { "mean": { "gap_recovered_mean": -0.057793517770438306, "gap_recovered_std": 0.17563038381464027, "accuracy_mean": 0.25036947076019195, "accuracy_std": 0.03216155322401068, "per_seed_gap_recovered": [ 0.07797600483072223, 0.05780125075373164, -0.25, 0.07525515556335458, -0.25 ], "per_seed_accuracy": [ 0.27513360811650067, 0.2717492408341375, 0.21516666666666664, 0.27463117151698846, 0.21516666666666664 ] }, "global_ridge": { "gap_recovered_mean": 0.18758141994476324, "gap_recovered_std": 0.012099918774017874, "accuracy_mean": 0.2948275943624562, "accuracy_std": 0.002092518094207094, "per_seed_gap_recovered": [ 0.20656509234987466, 0.18344950675964355, 0.18193462388268833, 0.19116208060034404, 0.17479579613126564 ], "per_seed_accuracy": [ 0.2980716934039675, 0.2941577190530711, 0.29408085632872305, 0.2953825493543998, 0.29244515367211965 ] }, "topk8_global_ridge": { "gap_recovered_mean": 0.18764435011765057, "gap_recovered_std": 0.012084144151669675, "accuracy_mean": 0.29483878477140407, "accuracy_std": 0.002090313347697144, "per_seed_gap_recovered": [ 0.20655925520535176, 0.18367281864429347, 0.18193721360173717, 0.19125222748723533, 0.17480023564963507 ], "per_seed_accuracy": [ 0.2980712127986996, 0.29419602497144676, 0.29408122587752067, 0.29539983888056087, 0.29244562132879237 ] } }, "8": { "mean": { "gap_recovered_mean": -0.13972029274907605, "gap_recovered_std": 0.16140573821969564, "accuracy_mean": 0.2354215987117811, "accuracy_std": 0.029564716809042605, "per_seed_gap_recovered": [ -0.25, -0.05490825505092227, 0.10630679130554208, -0.25, -0.25 ], "per_seed_accuracy": [ 0.21516666666666664, 0.25132430619480967, 0.280283687364096, 0.21516666666666664, 0.21516666666666664 ] }, "global_ridge": { "gap_recovered_mean": 0.20068106569092853, "gap_recovered_std": 0.012220388493588764, "accuracy_mean": 0.2971712960484384, "accuracy_std": 0.002230078724212129, "per_seed_gap_recovered": [ 0.21065586188743862, 0.18605573012911042, 0.21070932108780432, 0.20718265812972508, 0.18880175722056428 ], "per_seed_accuracy": [ 0.2990403014928445, 0.29462198929128974, 0.2990668709141085, 0.2982323727278874, 0.29489494581606196 ] }, "topk8_global_ridge": { "gap_recovered_mean": 0.20068368007396833, "gap_recovered_std": 0.012222230487989805, "accuracy_mean": 0.29717193931272656, "accuracy_std": 0.0022303862873171943, "per_seed_gap_recovered": [ 0.21065314884843503, 0.18605505186935956, 0.21070738907518063, 0.20720142331616637, 0.18880138726070012 ], "per_seed_accuracy": [ 0.29903937467487374, 0.2946218563112719, 0.2990664386091561, 0.29823706614834145, 0.29489496081998984 ] } }, "12": { "mean": { "gap_recovered_mean": -0.06185165931438573, "gap_recovered_std": 0.11987185088549845, "accuracy_mean": 0.24987132692446656, "accuracy_std": 0.02200175109794565, "per_seed_gap_recovered": [ -0.11435040523265969, 0.01908318338723024, 0.020892102142860212, 0.01511682313064057, -0.25 ], "per_seed_accuracy": [ 0.24058434674383583, 0.2647677935293351, 0.2649075207929502, 0.263930306889545, 0.21516666666666664 ] }, "global_ridge": { "gap_recovered_mean": 0.20759854234498124, "gap_recovered_std": 0.008070049662145362, "accuracy_mean": 0.2985715409284351, "accuracy_std": 0.0013586871836284732, "per_seed_gap_recovered": [ 0.21448874884638292, 0.19910567382286332, 0.21295600512932095, 0.2129719134034782, 0.19847037052286085 ], "per_seed_accuracy": [ 0.2998147268980399, 0.2971767584373211, 0.2994404271991774, 0.29941309983976955, 0.29701269226786736 ] }, "topk8_global_ridge": { "gap_recovered_mean": 0.20760713363515926, "gap_recovered_std": 0.00804956171172338, "accuracy_mean": 0.2985733282867519, "accuracy_std": 0.0013561412938367808, "per_seed_gap_recovered": [ 0.21448340498167906, 0.1991690602795832, 0.21295518299628952, 0.21295721777554216, 0.19847080214270246 ], "per_seed_accuracy": [ 0.2998131661469909, 0.29718647974935075, 0.2994436094020975, 0.2994110638958284, 0.29701232223949214 ] } }, "16": { "mean": { "gap_recovered_mean": -0.010073316508325977, "gap_recovered_std": 0.08824797021532643, "accuracy_mean": 0.25940458604933203, "accuracy_std": 0.015888833442519255, "per_seed_gap_recovered": [ 0.04609660444588505, 0.04871861276955448, -0.03942812722304764, 0.04715635447666568, -0.15291002701068745 ], "per_seed_accuracy": [ 0.2694970770704336, 0.27004897209693646, 0.25407284990398366, 0.26969607279492525, 0.23370795838038125 ] }, "global_ridge": { "gap_recovered_mean": 0.2123188520299978, "gap_recovered_std": 0.0055871937881194125, "accuracy_mean": 0.2994135150717593, "accuracy_std": 0.0009899319263617858, "per_seed_gap_recovered": [ 0.21571068517093012, 0.21426208676963024, 0.21296079405422877, 0.21608726320595584, 0.20257343094924402 ], "per_seed_accuracy": [ 0.30001587030805393, 0.29979992151534424, 0.299442303306755, 0.3001066524297341, 0.29770282779890916 ] }, "topk8_global_ridge": { "gap_recovered_mean": 0.21201921742537935, "gap_recovered_std": 0.005505111192090935, "accuracy_mean": 0.2993448407787016, "accuracy_std": 0.0009667310823210812, "per_seed_gap_recovered": [ 0.21541710146542262, 0.21387085421332014, 0.2126474791559681, 0.21573547248182634, 0.20242517981035965 ], "per_seed_accuracy": [ 0.29994774891316206, 0.29971016041437787, 0.2993640720734651, 0.30002444234661674, 0.2976777801458863 ] } }, "24": { "mean": { "gap_recovered_mean": 0.02425117328249178, "gap_recovered_std": 0.0, "accuracy_mean": 0.26560043321532767, "accuracy_std": 0.0, "per_seed_gap_recovered": [ 0.02425117328249178, 0.02425117328249178, 0.02425117328249178, 0.02425117328249178, 0.02425117328249178 ], "per_seed_accuracy": [ 0.26560043321532767, 0.26560043321532767, 0.26560043321532767, 0.26560043321532767, 0.26560043321532767 ] }, "global_ridge": { "gap_recovered_mean": 0.21777514342604026, "gap_recovered_std": 0.0, "accuracy_mean": 0.3004704752078001, "accuracy_std": 0.0, "per_seed_gap_recovered": [ 0.21777514342604026, 0.21777514342604026, 0.21777514342604026, 0.21777514342604026, 0.21777514342604026 ], "per_seed_accuracy": [ 0.3004704752078001, 0.3004704752078001, 0.3004704752078001, 0.3004704752078001, 0.3004704752078001 ] }, "topk8_global_ridge": { "gap_recovered_mean": 0.21633328651559772, "gap_recovered_std": 0.0, "accuracy_mean": 0.3001800095103253, "accuracy_std": 0.0, "per_seed_gap_recovered": [ 0.21633328651559772, 0.21633328651559772, 0.21633328651559772, 0.21633328651559772, 0.21633328651559772 ], "per_seed_accuracy": [ 0.3001800095103253, 0.3001800095103253, 0.3001800095103253, 0.3001800095103253, 0.3001800095103253 ] } } } }