cross-model-lora-prediction-3b / results_round5.json
CK0607's picture
Round 5 upload results_round5.json
3ab752a verified
raw
history blame
296 kB
{
"config": {
"model_X": "Qwen/Qwen2.5-3B-Instruct",
"model_Y": "meta-llama/Llama-3.2-3B-Instruct",
"hub_repo": "CK0607/cross-model-lora-prediction-3b",
"lora": {
"r": 16,
"alpha": 32,
"targets": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
"dropout": 0.0
},
"train": {
"examples": 1500,
"eval_examples": 300,
"round5_eval_examples": 100,
"round5_fast_surrogate": true,
"epochs": 3.0,
"bs": 8,
"lr": 0.0002,
"max_len": 512
},
"r4_anchor_names": [
"gsm8k",
"mbpp",
"sciq",
"arc_easy",
"openbookqa",
"svamp",
"multiarith",
"mmlu_high_school_biology",
"math_counting_easy",
"humaneval",
"mmlu_high_school_physics",
"mbpp_sanitized",
"mmlu_elementary_math",
"math_algebra_easy",
"aqua_rat",
"medmcqa_easy"
],
"r5_requested_new_anchors": [
"aqua_rat_numeric",
"math_counting_easy",
"mawps",
"mbpp_sanitized",
"humaneval",
"conala_curated",
"medmcqa_easy",
"pubmedqa_pqal"
],
"r5_new_anchor_names": [
"aqua_rat_numeric",
"math_counting_easy",
"mawps",
"mbpp_sanitized",
"humaneval",
"conala_curated",
"medmcqa_easy",
"pubmedqa_pqal"
],
"dropped": [],
"pool_anchor_names": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"heldouts": [
"gsm_hard",
"gsm8k_test_500",
"mbpp_test_held",
"mbpp_plus",
"openbookqa_test"
],
"N_values": [
4,
8,
12,
16,
24
],
"methods": [
"mean",
"global_ridge",
"topk8_global_ridge"
],
"seeds": [
11,
22,
33,
44,
55
]
},
"dataset_audit": {
"tasks": {
"aqua_rat_numeric": {
"domain": "math",
"kind": "math_num",
"dataset": "deepmind/aqua_rat",
"config": "raw",
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Solve the math problem. Respond with only the final answer.\n\nProblem: From (1, 2, 3, 4, 5, 6), one number is picked out and replaced and one number is picked out again. If the sum of the 2 numbers is 9, what is the probability that the 2 numbers included the number 5?\n\nFinal answer:"
},
{
"role": "assistant",
"content": "1/2"
}
]
},
"ok": true
},
"math_counting_easy": {
"domain": "math",
"kind": "math_solution",
"dataset": "EleutherAI/hendrycks_math",
"config": "counting_and_probability",
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Alex has 10 different kinds of lunch meat and 9 different kinds of cheese. If he wants to make a sandwich with one kind of meat and two kinds of cheese, how many different sandwiches could he make? (It does not matter in which order he chooses the two types of cheese.)\n\nFinal answer:"
},
{
"role": "assistant",
"content": "360"
}
]
},
"ok": true
},
"mawps": {
"domain": "math",
"kind": "math_num",
"dataset": "MU-NLPC/Calc-mawps",
"config": null,
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Marilyn had 279 Beg. Daryl clasped some Beg. Now Marilyn has 59 Beg. How many did Daryl claspeds?\n\nFinal answer:"
},
{
"role": "assistant",
"content": "220"
}
]
},
"ok": true
},
"mbpp_sanitized": {
"domain": "code",
"kind": "code",
"dataset": "google-research-datasets/mbpp",
"config": "sanitized",
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to reverse words seperated by spaces in a given string.\n\nTests:\nassert reverse_words(\"python program\")==(\"program python\")\nassert reverse_words(\"java language\")==(\"language java\")\nassert reverse_words(\"indian man\")==(\"man indian\")\n\nCode:"
},
{
"role": "assistant",
"content": "def reverse_words(s):\n return ' '.join(reversed(s.split()))"
}
]
},
"ok": true
},
"humaneval": {
"domain": "code",
"kind": "code",
"dataset": "openai/openai_humaneval",
"config": null,
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Complete the following Python function. Respond with only the completion code.\n\ndef below_threshold(l: list, t: int):\n \"\"\"Return True if all numbers in the list l are below threshold t.\n >>> below_threshold([1, 2, 4, 10], 100)\n True\n >>> below_threshold([1, 20, 4, 10], 5)\n False\n \"\"\""
},
{
"role": "assistant",
"content": "for e in l:\n if e >= t:\n return False\n return True"
}
]
},
"ok": true
},
"conala_curated": {
"domain": "code",
"kind": "code",
"dataset": "codeparrot/conala-mined-curated",
"config": null,
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [],
"sample": {
"messages": [
{
"role": "user",
"content": "Write a Python code snippet for this intent. Respond with only code.\n\nIntent: print all contacts in class all_contacts\n\nCode:"
},
{
"role": "assistant",
"content": "all_contacts = []"
}
]
},
"ok": true
},
"medmcqa_easy": {
"domain": "science",
"kind": "mcq",
"dataset": "openlifescienceai/medmcqa",
"config": null,
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"H"
],
"sample": {
"messages": [
{
"role": "user",
"content": "Answer the multiple-choice question. Respond with only the option letter.\n\nApolipoprotein A-I (Apo A-I) is found in which of the following lipid components?\nA. LDL\nB. HDL\nC. VLDL\nD. Chylomicron\nAnswer:"
},
{
"role": "assistant",
"content": "B"
}
]
},
"ok": true
},
"pubmedqa_pqal": {
"domain": "science",
"kind": "short",
"dataset": "qiaojin/PubMedQA",
"config": "pqa_labeled",
"train_rows_sampled": 20,
"eval_rows_sampled": 10,
"labels": [
"yes",
"no",
"maybe"
],
"sample": {
"messages": [
{
"role": "user",
"content": "Answer the biomedical question with only one of: yes, no, maybe.\n\nContext: To assess whether eligibility to an adjuvant chemotherapy protocol in itself represents a good prognostic factor after radical cystectomy for bladder cancer. Between April 1984 and May 1989, our institution entered 35 patients with invasive bladder cancer into the Swiss Group for Clinical and Epidemiological Cancer Research (SAKK) study 09/84. They were randomly assigned to either observation or three postoperative courses of cisplatin monotherapy after cystectomy. This study had a negative result. The outcome of these 35 patients (protocol group) was compared with an age- and tumor-stage-matched cohort (matched group; n = 35) who also underwent cystectomy during the same period, but were not entered into the SAKK study, as well as the remaining 57 patients treated during the study period for the same indication (remaining group). Median overall survival decreased from 76.3 months in the protocol group to 52.1 months in the matched group and to 20.3 months in the remaining group. The respective times of median recurrence-free survival were 67.2, 16.0, and 9.4 months. Tumor progression occurred in 46% of the protocol group compared with 69% in the matched group and 65% in the remaining group (P<.05). Cancer-related death was noted in 40% of the protocol group, 57% in the matched group, and 56% in the remaining group.\n\nQuestion: Is eligibility for a chemotherapy protocol a good prognostic factor for invasive bladder cancer after radical cystectomy?\n\nAnswer:"
},
{
"role": "assistant",
"content": "yes"
}
]
},
"ok": true
}
},
"dropped": [],
"kept": [
"aqua_rat_numeric",
"math_counting_easy",
"mawps",
"mbpp_sanitized",
"humaneval",
"conala_curated",
"medmcqa_easy",
"pubmedqa_pqal"
],
"anchor_domain_counts": {
"math": 3,
"code": 3,
"science": 2
}
},
"training": [
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "aqua_rat_numeric",
"ok": true,
"gpu": 0
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "humaneval",
"ok": true,
"gpu": 0
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "aqua_rat_numeric",
"ok": true,
"gpu": 1
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "humaneval",
"ok": true,
"gpu": 1
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "math_counting_easy",
"ok": true,
"gpu": 2
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "conala_curated",
"ok": true,
"gpu": 2
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "math_counting_easy",
"ok": true,
"gpu": 3
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "conala_curated",
"ok": true,
"gpu": 3
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "mawps",
"ok": true,
"gpu": 4
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "medmcqa_easy",
"ok": true,
"gpu": 4
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "mawps",
"ok": true,
"gpu": 5
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "medmcqa_easy",
"ok": true,
"gpu": 5
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "mbpp_sanitized",
"ok": true,
"gpu": 6
},
{
"side": "X",
"model": "Qwen/Qwen2.5-3B-Instruct",
"task": "pubmedqa_pqal",
"ok": true,
"gpu": 6
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "mbpp_sanitized",
"ok": true,
"gpu": 7
},
{
"side": "Y",
"model": "meta-llama/Llama-3.2-3B-Instruct",
"task": "pubmedqa_pqal",
"ok": true,
"gpu": 7
}
],
"baselines": {
"gsm_hard": {
"base_Y": 0.06333333333333334,
"oracle": 0.15
},
"gsm8k_test_500": {
"base_Y": 0.08,
"oracle": 0.29333333333333333
},
"mbpp_test_held": {
"base_Y": 0.23,
"oracle": 0.32
},
"mbpp_plus": {
"base_Y": 0.21666666666666667,
"oracle": 0.45
},
"openbookqa_test": {
"base_Y": 0.71,
"oracle": 0.9833333333333333
}
},
"records": [
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 11,
"method": "mean",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.06235950804304804,
"gap_recovered": -0.011236445657138083
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07221313142228401,
"gap_recovered": 0.10245920871866164
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r5:medmcqa_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0722087583596679,
"gap_recovered": 0.10240875030386039
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 11,
"method": "mean",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.09861895769491964,
"gap_recovered": 0.08727636419493583
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12459708992092089,
"gap_recovered": 0.20904885900431666
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r5:medmcqa_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12459502910745557,
"gap_recovered": 0.209039198941198
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 11,
"method": "mean",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.24167782528647064,
"gap_recovered": 0.12975361429411808
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25468766738628523,
"gap_recovered": 0.27430741540316916
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r5:medmcqa_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25469019236235785,
"gap_recovered": 0.2743354706928649
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 11,
"method": "mean",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2398360331853231,
"gap_recovered": 0.09929728507995608
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.274934759222228,
"gap_recovered": 0.24972039666669132
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r5:medmcqa_easy"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749305629182136,
"gap_recovered": 0.2497024125066297
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 11,
"method": "mean",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.733175716372742,
"gap_recovered": 0.08478920624173929
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7639258190681194,
"gap_recovered": 0.19728958195653454
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:multiarith",
"r4:mbpp_sanitized",
"r5:aqua_rat_numeric",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r5:medmcqa_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7639315212458029,
"gap_recovered": 0.1973104435822058
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 22,
"method": "mean",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.06060013856010876,
"gap_recovered": -0.03153686276797588
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0706267873720191,
"gap_recovered": 0.08415523890791271
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07063988874698508,
"gap_recovered": 0.0843064086190586
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 22,
"method": "mean",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.09357568675074085,
"gap_recovered": 0.06363603164409773
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1202605464540679,
"gap_recovered": 0.18872131150344332
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1203002280750494,
"gap_recovered": 0.1889073191017941
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 22,
"method": "mean",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23954079837634648,
"gap_recovered": 0.10600887084829412
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25191722294379926,
"gap_recovered": 0.2435246993755472
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25194909498609347,
"gap_recovered": 0.24387883317881623
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 22,
"method": "mean",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.23348182179461952,
"gap_recovered": 0.07206495054836931
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26477579495002485,
"gap_recovered": 0.2061819783572493
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2648303948599717,
"gap_recovered": 0.20641597797130715
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 22,
"method": "mean",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7315477586888719,
"gap_recovered": 0.07883326349587293
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7632082435454445,
"gap_recovered": 0.19466430565406528
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7632605181891342,
"gap_recovered": 0.19485555435049115
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07145326948713983,
"gap_recovered": 0.09369157100545951
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": [
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:mbpp"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07145337636443391,
"gap_recovered": 0.09369280420500663
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12266466074976429,
"gap_recovered": 0.1999905972645201
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": [
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:mbpp"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12266483613814431,
"gap_recovered": 0.19999141939755147
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2502199395771684,
"gap_recovered": 0.2246659953018713
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": [
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:mbpp"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25022055001094423,
"gap_recovered": 0.22467277789938025
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26078072169731403,
"gap_recovered": 0.18906023584563153
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": [
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:mbpp"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2607817048313974,
"gap_recovered": 0.1890644492774174
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7652856901322288,
"gap_recovered": 0.20226471999595919
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy"
],
"selected_topk": [
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:mbpp"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7652856620426836,
"gap_recovered": 0.20226461722933023
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 44,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.06258799388490875,
"gap_recovered": -0.008600070558745234
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07188610471528153,
"gap_recovered": 0.09868582363786381
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": [
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07188549907728174,
"gap_recovered": 0.09867883550709697
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 44,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.09898812831133263,
"gap_recovered": 0.08900685145937172
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12376669171212734,
"gap_recovered": 0.20515636740059692
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": [
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12377916621065689,
"gap_recovered": 0.20521484161245418
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 44,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.24100257454247312,
"gap_recovered": 0.12225082824970122
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25219830919956343,
"gap_recovered": 0.24664787999514912
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": [
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2522135422969687,
"gap_recovered": 0.2468171366329853
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 44,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.23734664177072462,
"gap_recovered": 0.0886284647316769
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2656234004031653,
"gap_recovered": 0.20981457315642263
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": [
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26564392632451556,
"gap_recovered": 0.2099025413907809
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 44,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7332305190755033,
"gap_recovered": 0.08498970393476829
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7634382407418613,
"gap_recovered": 0.19550575881168777
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"selected_topk": [
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7634770604933815,
"gap_recovered": 0.19564778229285923
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.06984950443793989,
"gap_recovered": 0.07518658966853708
},
{
"task": "gsm_hard",
"domain": "math",
"N": 4,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r5:math_counting_easy",
"r4:sciq"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.06985002991796911,
"gap_recovered": 0.07519265289964354
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.11843885312135194,
"gap_recovered": 0.1801821240063372
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 4,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:sciq",
"r5:math_counting_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.11843983968098959,
"gap_recovered": 0.1801867485046387
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2514841386778601,
"gap_recovered": 0.23871265197622352
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 4,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r5:math_counting_easy",
"r4:sciq"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2514852855534389,
"gap_recovered": 0.23872539503821
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26363710987156835,
"gap_recovered": 0.20130189944957863
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 4,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r5:math_counting_easy",
"r4:sciq"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26363701395604805,
"gap_recovered": 0.20130148838306303
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7588161622518781,
"gap_recovered": 0.17859571555565168
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 4,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:sciq",
"r5:math_counting_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7588159375355161,
"gap_recovered": 0.1785948934226201
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07245608132461023,
"gap_recovered": 0.10526247682242572
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r5:medmcqa_easy",
"r4:mbpp",
"r5:mbpp_sanitized",
"r4:gsm8k"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0724562861727572,
"gap_recovered": 0.10526484045489079
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12521504895440466,
"gap_recovered": 0.21194554197377186
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r5:medmcqa_easy",
"r4:mbpp",
"r5:mbpp_sanitized",
"r5:aqua_rat_numeric"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1252146762540971,
"gap_recovered": 0.21194379494108018
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25471247319517465,
"gap_recovered": 0.27458303550194046
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r5:medmcqa_easy",
"r4:mbpp",
"r5:mbpp_sanitized",
"r4:gsm8k"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25471275066507276,
"gap_recovered": 0.2745861185008084
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493516686318936,
"gap_recovered": 0.24972214369938295
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r5:medmcqa_easy",
"r4:mbpp",
"r5:mbpp_sanitized",
"r4:gsm8k"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749353107364699,
"gap_recovered": 0.24972276029915658
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7678827371268436,
"gap_recovered": 0.21176611143967194
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:multiarith",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:multiarith",
"r5:medmcqa_easy",
"r4:mbpp",
"r5:mbpp_sanitized",
"r4:gsm8k"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.767877849545972,
"gap_recovered": 0.21174823004623933
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 22,
"method": "mean",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.05140127220373045,
"gap_recovered": -0.13767762841849493
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0708979350671001,
"gap_recovered": 0.08728386615884724
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:aqua_rat_numeric"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0708976678738649,
"gap_recovered": 0.0872807831599795
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 22,
"method": "mean",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.06954419015467853,
"gap_recovered": -0.04901160864994442
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12091904212688578,
"gap_recovered": 0.19180800996977712
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:aqua_rat_numeric"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12091888866205326,
"gap_recovered": 0.19180729060337467
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 22,
"method": "mean",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.22929394602775574,
"gap_recovered": -0.007845044136047438
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25210894539438444,
"gap_recovered": 0.24565494882649372
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:mawps"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2521090841293335,
"gap_recovered": 0.24565649032592798
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 22,
"method": "mean",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2074264666129803,
"gap_recovered": -0.03960085737294156
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2652467161759563,
"gap_recovered": 0.20820021218266982
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:mawps"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2652471477957978,
"gap_recovered": 0.20820206198199043
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 22,
"method": "mean",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6989556559749033,
"gap_recovered": -0.040406136677183
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7639373076921222,
"gap_recovered": 0.1973316135077643
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:mawps"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7639364930953102,
"gap_recovered": 0.19732863327552524
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0646524178844759,
"gap_recovered": 0.015220206359337235
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07239382530080862,
"gap_recovered": 0.10454413808625329
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:mbpp"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07239352248180872,
"gap_recovered": 0.1045406440208698
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.10447578956340922,
"gap_recovered": 0.11473026357848072
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12504286141231144,
"gap_recovered": 0.21113841287020987
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:mbpp"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1250425544826464,
"gap_recovered": 0.21113697413740498
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.24428923323236665,
"gap_recovered": 0.1587692581374071
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547727211590471,
"gap_recovered": 0.27525245732274534
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:mbpp"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477285064499955,
"gap_recovered": 0.27525389605555045
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.24603687565902185,
"gap_recovered": 0.12587232425295075
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493519084206947,
"gap_recovered": 0.24972224646601196
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:mbpp"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493516686318936,
"gap_recovered": 0.24972214369938295
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7419641204812061,
"gap_recovered": 0.11694190419953457
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7681897558563057,
"gap_recovered": 0.21288935069380135
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:mbpp"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7681880985731366,
"gap_recovered": 0.21288328746269514
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07217366698144498,
"gap_recovered": 0.10200384978590354
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r4:sciq",
"r4:gsm8k"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07217657938770865,
"gap_recovered": 0.10203745447356127
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12450159094799526,
"gap_recovered": 0.2086012075687278
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r4:sciq",
"r4:arc_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12450937380735902,
"gap_recovered": 0.20863768972199542
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25476326868451876,
"gap_recovered": 0.2751474298279861
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r4:arc_easy",
"r4:sciq"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547594395999251,
"gap_recovered": 0.275104884443612
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749258150999574,
"gap_recovered": 0.2496820647141031
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r4:arc_easy",
"r4:sciq"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749346633067076,
"gap_recovered": 0.2497199856001755
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7647975219255206,
"gap_recovered": 0.20047873875190492
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:sciq",
"r5:mbpp_sanitized",
"r4:gsm8k"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7648052746400066,
"gap_recovered": 0.20050710234148764
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07169886460249453,
"gap_recovered": 0.09652536079801373
},
{
"task": "gsm_hard",
"domain": "math",
"N": 8,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:mbpp_sanitized",
"r5:math_counting_easy",
"r4:sciq",
"r5:aqua_rat_numeric"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07169961274355308,
"gap_recovered": 0.09653399319484314
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1234050847196031,
"gap_recovered": 0.20346133462313953
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 8,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:mbpp_sanitized",
"r4:sciq",
"r5:math_counting_easy",
"r5:aqua_rat_numeric"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1234069701446884,
"gap_recovered": 0.2034701725532269
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2520854159470262,
"gap_recovered": 0.2453935105225134
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 8,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:mbpp_sanitized",
"r4:arc_easy",
"r5:math_counting_easy",
"r4:sciq"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2520842228264644,
"gap_recovered": 0.24538025362738236
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2653153916885113,
"gap_recovered": 0.2084945358079056
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 8,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r4:arc_easy",
"r4:sciq"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2653136891880255,
"gap_recovered": 0.2084872393772521
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 55,
"method": "mean",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7619699721226747,
"gap_recovered": 0.19013404435124903
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 8,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:sciq",
"r4:arc_easy",
"r4:humaneval",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:sciq",
"r5:mbpp_sanitized",
"r5:math_counting_easy",
"r5:aqua_rat_numeric"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7619703091972175,
"gap_recovered": 0.19013527755079598
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.046795635689264065,
"gap_recovered": -0.19081958820079933
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07289376165675021,
"gap_recovered": 0.11031263450096393
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:medmcqa_easy",
"r4:mbpp"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07289361024725027,
"gap_recovered": 0.11031088746827233
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.05766524479306978,
"gap_recovered": -0.10469416503248542
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12629289824387124,
"gap_recovered": 0.21699796051814646
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:medmcqa_easy",
"r4:mbpp"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1262937751857714,
"gap_recovered": 0.21700207118330342
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.22348956247855878,
"gap_recovered": -0.07233819468268035
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477145404651247,
"gap_recovered": 0.27523837829458286
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:medmcqa_easy",
"r4:mbpp"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547717407654072,
"gap_recovered": 0.27524156406008
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.19311567914897,
"gap_recovered": -0.10093280364727152
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749361260183927,
"gap_recovered": 0.24972625436454002
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:medmcqa_easy",
"r4:mbpp"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749361020395126,
"gap_recovered": 0.249726151597911
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6818556116093164,
"gap_recovered": -0.10296727460006182
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7701793945246729,
"gap_recovered": 0.22016851655368141
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:humaneval",
"r4:multiarith",
"r4:svamp",
"r5:medmcqa_easy",
"r4:mbpp"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7701706024970131,
"gap_recovered": 0.2201363505988285
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.057627453502567344,
"gap_recovered": -0.06583707497037687
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07227199409199858,
"gap_recovered": 0.10313839336921435
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07227138845399879,
"gap_recovered": 0.10313140523844752
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08585960651266165,
"gap_recovered": 0.0274669055281015
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12446026506095098,
"gap_recovered": 0.20840749247320772
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12446679827810705,
"gap_recovered": 0.20843811692862682
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23572552253460063,
"gap_recovered": 0.06361691705111805
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25255285098634916,
"gap_recovered": 0.25058723318165727
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25256959167020077,
"gap_recovered": 0.2507732407800084
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2237824846958292,
"gap_recovered": 0.030496362982125127
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26642122570125537,
"gap_recovered": 0.21323382443395156
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.266440984298443,
"gap_recovered": 0.2133185041361842
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208439004010168,
"gap_recovered": 0.039672806345183394
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7701774563460514,
"gap_recovered": 0.22016142565628571
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r5:humaneval",
"r4:svamp",
"r4:openbookqa",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.770183636046004,
"gap_recovered": 0.22018403431464906
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.05790650121096908,
"gap_recovered": -0.06261729371958759
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07281855566748258,
"gap_recovered": 0.10944487308633737
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:medmcqa_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07281329196074915,
"gap_recovered": 0.10938413800864398
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08668351535139414,
"gap_recovered": 0.03132897820966002
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1260294868206156,
"gap_recovered": 0.21576321947163563
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:medmcqa_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12603113108667835,
"gap_recovered": 0.21577092696880476
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23607687341755837,
"gap_recovered": 0.06752081575064844
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.254778455536941,
"gap_recovered": 0.27531617263267777
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:medmcqa_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547751813921435,
"gap_recovered": 0.2752797932460388
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22509549623248223,
"gap_recovered": 0.036123555282066684
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27492768545260377,
"gap_recovered": 0.24969008051115896
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:medmcqa_easy"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493065883373397,
"gap_recovered": 0.24970282357314552
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.718775217752347,
"gap_recovered": 0.03210445519151349
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7686479525182439,
"gap_recovered": 0.21456567994479495
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:mbpp_sanitized",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat",
"r4:medmcqa_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7686677837371826,
"gap_recovered": 0.21463823318481448
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.05730614473079814,
"gap_recovered": -0.06954448387540614
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07286305224758456,
"gap_recovered": 0.1099582951644372
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy",
"r5:conala_curated"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07285891075243896,
"gap_recovered": 0.10991050868198791
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.0851024329525301,
"gap_recovered": 0.023917654464984846
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.126309691681259,
"gap_recovered": 0.21707667975590156
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:openbookqa",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12630539466594828,
"gap_recovered": 0.2170565374966326
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23555245531016383,
"gap_recovered": 0.061693947890709144
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25476923428732773,
"gap_recovered": 0.27521371430364133
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:conala_curated",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547697984761205,
"gap_recovered": 0.2752199830680057
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22392417589823407,
"gap_recovered": 0.03110361099243171
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.274942360527214,
"gap_recovered": 0.24975297368805993
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:conala_curated",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493696527919553,
"gap_recovered": 0.24972985119655225
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7177663255559987,
"gap_recovered": 0.02841338618048329
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7681811604554626,
"gap_recovered": 0.21285790410535108
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mbpp_sanitized",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:openbookqa",
"r4:aqua_rat",
"r5:medmcqa_easy",
"r5:conala_curated"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7681842503054388,
"gap_recovered": 0.21286920843453233
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.04166666666666667,
"gap_recovered": -0.25000000000000006
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07221353221213682,
"gap_recovered": 0.10246383321696324
},
{
"task": "gsm_hard",
"domain": "math",
"N": 12,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:openbookqa",
"r5:conala_curated",
"r5:mbpp_sanitized",
"r5:math_counting_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0722140398792837,
"gap_recovered": 0.10246969091481181
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.026666666666666672,
"gap_recovered": -0.25
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12460340390260194,
"gap_recovered": 0.20907845579344658
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 12,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:openbookqa",
"r5:conala_curated",
"r5:mbpp_sanitized",
"r4:sciq"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1246049604744747,
"gap_recovered": 0.20908575222410014
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.20750000000000002,
"gap_recovered": -0.24999999999999992
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2525556534323199,
"gap_recovered": 0.2506183714702209
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 12,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:conala_curated",
"r4:openbookqa",
"r5:mbpp_sanitized",
"r4:arc_easy"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25255594015121463,
"gap_recovered": 0.250621557235718
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.15833333333333333,
"gap_recovered": -0.25000000000000006
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26660809311373485,
"gap_recovered": 0.2140346847731493
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 12,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:conala_curated",
"r4:openbookqa",
"r5:math_counting_easy",
"r5:mbpp_sanitized"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2666099874452613,
"gap_recovered": 0.2140428033368342
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6416666666666666,
"gap_recovered": -0.2500000000000001
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7690827786785432,
"gap_recovered": 0.21615650736052422
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 12,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated"
],
"selected_topk": [
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:openbookqa",
"r5:conala_curated",
"r4:sciq",
"r5:mbpp_sanitized"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7690766832472264,
"gap_recovered": 0.2161342070020481
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.059948044563161926,
"gap_recovered": -0.03906102427120862
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07311635143455418,
"gap_recovered": 0.11288097809100972
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:mmlu_high_school_biology",
"r4:svamp"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07309355094515045,
"gap_recovered": 0.1126178955209667
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.0921708915973532,
"gap_recovered": 0.05705105436259312
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12692907574533047,
"gap_recovered": 0.21998004255623657
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:mmlu_high_school_biology",
"r4:svamp"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12690070667486084,
"gap_recovered": 0.2198470625384102
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23833444250041042,
"gap_recovered": 0.09260491667122672
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547743489824493,
"gap_recovered": 0.27527054424943626
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2547730911189112,
"gap_recovered": 0.2752565679879024
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2306903924065075,
"gap_recovered": 0.060101681742174916
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.274936221933913,
"gap_recovered": 0.2497266654310556
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493053893933356,
"gap_recovered": 0.24970230974000093
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7263416142847346,
"gap_recovered": 0.05978639372463909
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7703233534440227,
"gap_recovered": 0.22069519552691244
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:arc_easy",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:svamp"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7700408568875543,
"gap_recovered": 0.21966167153983296
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.05999645997738017,
"gap_recovered": -0.038502384876382696
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0728135057153373,
"gap_recovered": 0.10938660440773808
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07278109517590754,
"gap_recovered": 0.10901263664508698
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.09217455282978629,
"gap_recovered": 0.05706821638962322
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12601758233432114,
"gap_recovered": 0.21570741719213035
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_high_school_biology",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12597274867967628,
"gap_recovered": 0.21549725943598258
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23862580439140058,
"gap_recovered": 0.09584227101556184
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477864976586967,
"gap_recovered": 0.2753183307318851
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477734565734866,
"gap_recovered": 0.27530384063720725
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.23142021360068488,
"gap_recovered": 0.06322948686007801
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493547858863043,
"gap_recovered": 0.24972347966555897
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:mmlu_high_school_biology",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749307787281344,
"gap_recovered": 0.24970333740629014
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 22,
"method": "mean",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7280278296854304,
"gap_recovered": 0.06595547445889202
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7704543911725625,
"gap_recovered": 0.22117460185083868
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:mbpp",
"r4:openbookqa",
"r4:svamp",
"r4:mmlu_high_school_biology",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:math_algebra_easy",
"r4:svamp",
"r4:openbookqa",
"r5:pubmedqa_pqal"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7700888338308225,
"gap_recovered": 0.21983719694203366
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.052866684595743826,
"gap_recovered": -0.12076902389526362
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.0728197847563645,
"gap_recovered": 0.10945905488112877
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07280703073260429,
"gap_recovered": 0.10931189306851101
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.07347902846062322,
"gap_recovered": -0.030567054090828657
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12603295074112114,
"gap_recovered": 0.21577945659900535
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1260124741477528,
"gap_recovered": 0.21568347256759124
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23054410515160398,
"gap_recovered": 0.006045612795599651
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477470044432016,
"gap_recovered": 0.275274449381335
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25477317435988067,
"gap_recovered": 0.2752574928875629
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.21094915592807464,
"gap_recovered": -0.024503617451108697
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749346393278276,
"gap_recovered": 0.24971988283354674
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27493053893933356,
"gap_recovered": 0.24970230974000093
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7025252753838725,
"gap_recovered": -0.027346553473636886
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7686494412641416,
"gap_recovered": 0.2145711265761279
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:svamp",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:math_algebra_easy",
"r4:svamp",
"r5:pubmedqa_pqal",
"r4:aqua_rat"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7682971421877542,
"gap_recovered": 0.2132822275161742
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.05988999237959414,
"gap_recovered": -0.039730857158529254
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07309392501567973,
"gap_recovered": 0.1126222117193814
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_high_school_biology",
"r4:aqua_rat"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306783804948304,
"gap_recovered": 0.11232120826326579
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.09205213174052623,
"gap_recovered": 0.05649436753371671
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716354808588137,
"gap_recovered": 0.22107913165256893
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_high_school_biology",
"r4:aqua_rat"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12713919102460491,
"gap_recovered": 0.22096495792783555
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.23857803332394567,
"gap_recovered": 0.09531148137717402
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25479441930507796,
"gap_recovered": 0.27549354783419944
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:humaneval",
"r5:humaneval",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25479235677883544,
"gap_recovered": 0.2754706308759492
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2313646065777746,
"gap_recovered": 0.06299117104760542
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749491705291573,
"gap_recovered": 0.24978215941067408
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:humaneval",
"r5:humaneval",
"r4:math_counting_easy",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_algebra_easy",
"r4:aqua_rat"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749346393278276,
"gap_recovered": 0.24971988283354674
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7265955999527854,
"gap_recovered": 0.06071560958336151
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7705321992128744,
"gap_recovered": 0.22145926541295533
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mbpp_sanitized",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:openbookqa"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7701881865523327,
"gap_recovered": 0.22020068250853433
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.043656596123487115,
"gap_recovered": -0.2270392754982257
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07285970342570339,
"gap_recovered": 0.10991965491196211
},
{
"task": "gsm_hard",
"domain": "math",
"N": 16,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:aqua_rat",
"r4:openbookqa",
"r5:medmcqa_easy"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07284248727491534,
"gap_recovered": 0.10972100701825385
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.04926707914505883,
"gap_recovered": -0.14406056650753674
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12633001480979483,
"gap_recovered": 0.21717194442091328
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 16,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math",
"r4:aqua_rat",
"r4:openbookqa",
"r5:conala_curated"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.126309735528354,
"gap_recovered": 0.21707688528915942
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2197829900938889,
"gap_recovered": -0.11352233229012336
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.25266901838368383,
"gap_recovered": 0.25187798204093137
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 16,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:conala_curated",
"r4:openbookqa"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2526553668646977,
"gap_recovered": 0.25172629849664097
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.18353273553409796,
"gap_recovered": -0.14200256199672304
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.26689128368750387,
"gap_recovered": 0.21524835866073083
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 16,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:conala_curated",
"r4:openbookqa"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2668517425142486,
"gap_recovered": 0.2150788964896368
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.6723003910053735,
"gap_recovered": -0.13792539876082843
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7697641186878599,
"gap_recovered": 0.2186492147116826
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 16,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:multiarith",
"r4:humaneval",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy"
],
"selected_topk": [
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:openbookqa",
"r4:aqua_rat",
"r5:medmcqa_easy"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.769729568547216,
"gap_recovered": 0.21852281175810734
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.058087711663081736,
"gap_recovered": -0.0605264038875185
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07326510681503122,
"gap_recovered": 0.11459738632728325
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306991325027642,
"gap_recovered": 0.11234515288780485
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08727496881594604,
"gap_recovered": 0.034101416324747065
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1274962817663434,
"gap_recovered": 0.22263882077973468
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716514850484914,
"gap_recovered": 0.22108663361648034
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2363231649481017,
"gap_recovered": 0.07025738831224113
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548052961250832,
"gap_recovered": 0.275614401389813
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548048059282632,
"gap_recovered": 0.27560895475848
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22549109979607596,
"gap_recovered": 0.037818999126039775
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27496794599226154,
"gap_recovered": 0.24986262568112086
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_counting_easy",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749465808101084,
"gap_recovered": 0.24977106061475035
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 11,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208252208534328,
"gap_recovered": 0.03960446653694945
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 11,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7718177453402815,
"gap_recovered": 0.22616248295224942
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 11,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_physics",
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7709135990581293,
"gap_recovered": 0.22285463070047312
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 22,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.058087711663081736,
"gap_recovered": -0.0605264038875185
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07326510681503122,
"gap_recovered": 0.11459738632728325
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306991325027642,
"gap_recovered": 0.11234515288780485
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 22,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08727496881594604,
"gap_recovered": 0.034101416324747065
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1274962817663434,
"gap_recovered": 0.22263882077973468
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716514850484914,
"gap_recovered": 0.22108663361648034
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 22,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2363231649481017,
"gap_recovered": 0.07025738831224113
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548052961250832,
"gap_recovered": 0.275614401389813
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548048059282632,
"gap_recovered": 0.27560895475848
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 22,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22549109979607596,
"gap_recovered": 0.037818999126039775
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27496794599226154,
"gap_recovered": 0.24986262568112086
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_counting_easy",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749465808101084,
"gap_recovered": 0.24977106061475035
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 22,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208252208534328,
"gap_recovered": 0.03960446653694945
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 22,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7718177453402815,
"gap_recovered": 0.22616248295224942
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 22,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_physics",
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7709135990581293,
"gap_recovered": 0.22285463070047312
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.058087711663081736,
"gap_recovered": -0.0605264038875185
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07326510681503122,
"gap_recovered": 0.11459738632728325
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306991325027642,
"gap_recovered": 0.11234515288780485
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08727496881594604,
"gap_recovered": 0.034101416324747065
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1274962817663434,
"gap_recovered": 0.22263882077973468
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716514850484914,
"gap_recovered": 0.22108663361648034
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2363231649481017,
"gap_recovered": 0.07025738831224113
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548052961250832,
"gap_recovered": 0.275614401389813
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548048059282632,
"gap_recovered": 0.27560895475848
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22549109979607596,
"gap_recovered": 0.037818999126039775
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27496794599226154,
"gap_recovered": 0.24986262568112086
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_counting_easy",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749465808101084,
"gap_recovered": 0.24977106061475035
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 33,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208252208534328,
"gap_recovered": 0.03960446653694945
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 33,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7718177453402815,
"gap_recovered": 0.22616248295224942
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 33,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_physics",
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7709135990581293,
"gap_recovered": 0.22285463070047312
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.058087711663081736,
"gap_recovered": -0.0605264038875185
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07326510681503122,
"gap_recovered": 0.11459738632728325
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306991325027642,
"gap_recovered": 0.11234515288780485
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08727496881594604,
"gap_recovered": 0.034101416324747065
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1274962817663434,
"gap_recovered": 0.22263882077973468
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716514850484914,
"gap_recovered": 0.22108663361648034
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2363231649481017,
"gap_recovered": 0.07025738831224113
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548052961250832,
"gap_recovered": 0.275614401389813
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548048059282632,
"gap_recovered": 0.27560895475848
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22549109979607596,
"gap_recovered": 0.037818999126039775
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27496794599226154,
"gap_recovered": 0.24986262568112086
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_counting_easy",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749465808101084,
"gap_recovered": 0.24977106061475035
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 44,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208252208534328,
"gap_recovered": 0.03960446653694945
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 44,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7718177453402815,
"gap_recovered": 0.22616248295224942
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 44,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_physics",
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7709135990581293,
"gap_recovered": 0.22285463070047312
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.058087711663081736,
"gap_recovered": -0.0605264038875185
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07326510681503122,
"gap_recovered": 0.11459738632728325
},
{
"task": "gsm_hard",
"domain": "math",
"N": 24,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.06333333333333334,
"oracle": 0.15,
"accuracy": 0.07306991325027642,
"gap_recovered": 0.11234515288780485
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.08727496881594604,
"gap_recovered": 0.034101416324747065
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.1274962817663434,
"gap_recovered": 0.22263882077973468
},
{
"task": "gsm8k_test_500",
"domain": "math",
"N": 24,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:math_counting_easy",
"r4:mbpp_sanitized",
"r4:mmlu_high_school_physics",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith",
"r4:math_algebra_easy",
"r4:mmlu_elementary_math"
],
"base_Y": 0.08,
"oracle": 0.29333333333333333,
"accuracy": 0.12716514850484914,
"gap_recovered": 0.22108663361648034
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2363231649481017,
"gap_recovered": 0.07025738831224113
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548052961250832,
"gap_recovered": 0.275614401389813
},
{
"task": "mbpp_test_held",
"domain": "code",
"N": 24,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r5:humaneval",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.23,
"oracle": 0.32,
"accuracy": 0.2548048059282632,
"gap_recovered": 0.27560895475848
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.22549109979607596,
"gap_recovered": 0.037818999126039775
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.27496794599226154,
"gap_recovered": 0.24986262568112086
},
{
"task": "mbpp_plus",
"domain": "code",
"N": 24,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mbpp_sanitized",
"r5:humaneval",
"r4:humaneval",
"r4:math_counting_easy",
"r4:mmlu_high_school_physics",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:mmlu_elementary_math"
],
"base_Y": 0.21666666666666667,
"oracle": 0.45,
"accuracy": 0.2749465808101084,
"gap_recovered": 0.24977106061475035
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 55,
"method": "mean",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7208252208534328,
"gap_recovered": 0.03960446653694945
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 55,
"method": "global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": null,
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7718177453402815,
"gap_recovered": 0.22616248295224942
},
{
"task": "openbookqa_test",
"domain": "science",
"N": 24,
"seed": 55,
"method": "topk8_global_ridge",
"anchors": [
"r4:gsm8k",
"r4:mbpp",
"r4:sciq",
"r4:arc_easy",
"r4:openbookqa",
"r4:svamp",
"r4:multiarith",
"r4:mmlu_high_school_biology",
"r4:math_counting_easy",
"r4:humaneval",
"r4:mmlu_high_school_physics",
"r4:mbpp_sanitized",
"r4:mmlu_elementary_math",
"r4:math_algebra_easy",
"r4:aqua_rat",
"r4:medmcqa_easy",
"r5:aqua_rat_numeric",
"r5:math_counting_easy",
"r5:mawps",
"r5:mbpp_sanitized",
"r5:humaneval",
"r5:conala_curated",
"r5:medmcqa_easy",
"r5:pubmedqa_pqal"
],
"selected_topk": [
"r4:mmlu_high_school_physics",
"r4:mmlu_high_school_biology",
"r4:mbpp_sanitized",
"r4:math_counting_easy",
"r4:mmlu_elementary_math",
"r5:humaneval",
"r4:humaneval",
"r4:multiarith"
],
"base_Y": 0.71,
"oracle": 0.9833333333333333,
"accuracy": 0.7709135990581293,
"gap_recovered": 0.22285463070047312
}
],
"summary": {
"4": {
"mean": {
"gap_recovered_mean": -0.057793517770438306,
"gap_recovered_std": 0.17563038381464027,
"accuracy_mean": 0.25036947076019195,
"accuracy_std": 0.03216155322401068,
"per_seed_gap_recovered": [
0.07797600483072223,
0.05780125075373164,
-0.25,
0.07525515556335458,
-0.25
],
"per_seed_accuracy": [
0.27513360811650067,
0.2717492408341375,
0.21516666666666664,
0.27463117151698846,
0.21516666666666664
]
},
"global_ridge": {
"gap_recovered_mean": 0.18758141994476324,
"gap_recovered_std": 0.012099918774017874,
"accuracy_mean": 0.2948275943624562,
"accuracy_std": 0.002092518094207094,
"per_seed_gap_recovered": [
0.20656509234987466,
0.18344950675964355,
0.18193462388268833,
0.19116208060034404,
0.17479579613126564
],
"per_seed_accuracy": [
0.2980716934039675,
0.2941577190530711,
0.29408085632872305,
0.2953825493543998,
0.29244515367211965
]
},
"topk8_global_ridge": {
"gap_recovered_mean": 0.18764435011765057,
"gap_recovered_std": 0.012084144151669675,
"accuracy_mean": 0.29483878477140407,
"accuracy_std": 0.002090313347697144,
"per_seed_gap_recovered": [
0.20655925520535176,
0.18367281864429347,
0.18193721360173717,
0.19125222748723533,
0.17480023564963507
],
"per_seed_accuracy": [
0.2980712127986996,
0.29419602497144676,
0.29408122587752067,
0.29539983888056087,
0.29244562132879237
]
}
},
"8": {
"mean": {
"gap_recovered_mean": -0.13972029274907605,
"gap_recovered_std": 0.16140573821969564,
"accuracy_mean": 0.2354215987117811,
"accuracy_std": 0.029564716809042605,
"per_seed_gap_recovered": [
-0.25,
-0.05490825505092227,
0.10630679130554208,
-0.25,
-0.25
],
"per_seed_accuracy": [
0.21516666666666664,
0.25132430619480967,
0.280283687364096,
0.21516666666666664,
0.21516666666666664
]
},
"global_ridge": {
"gap_recovered_mean": 0.20068106569092853,
"gap_recovered_std": 0.012220388493588764,
"accuracy_mean": 0.2971712960484384,
"accuracy_std": 0.002230078724212129,
"per_seed_gap_recovered": [
0.21065586188743862,
0.18605573012911042,
0.21070932108780432,
0.20718265812972508,
0.18880175722056428
],
"per_seed_accuracy": [
0.2990403014928445,
0.29462198929128974,
0.2990668709141085,
0.2982323727278874,
0.29489494581606196
]
},
"topk8_global_ridge": {
"gap_recovered_mean": 0.20068368007396833,
"gap_recovered_std": 0.012222230487989805,
"accuracy_mean": 0.29717193931272656,
"accuracy_std": 0.0022303862873171943,
"per_seed_gap_recovered": [
0.21065314884843503,
0.18605505186935956,
0.21070738907518063,
0.20720142331616637,
0.18880138726070012
],
"per_seed_accuracy": [
0.29903937467487374,
0.2946218563112719,
0.2990664386091561,
0.29823706614834145,
0.29489496081998984
]
}
},
"12": {
"mean": {
"gap_recovered_mean": -0.06185165931438573,
"gap_recovered_std": 0.11987185088549845,
"accuracy_mean": 0.24987132692446656,
"accuracy_std": 0.02200175109794565,
"per_seed_gap_recovered": [
-0.11435040523265969,
0.01908318338723024,
0.020892102142860212,
0.01511682313064057,
-0.25
],
"per_seed_accuracy": [
0.24058434674383583,
0.2647677935293351,
0.2649075207929502,
0.263930306889545,
0.21516666666666664
]
},
"global_ridge": {
"gap_recovered_mean": 0.20759854234498124,
"gap_recovered_std": 0.008070049662145362,
"accuracy_mean": 0.2985715409284351,
"accuracy_std": 0.0013586871836284732,
"per_seed_gap_recovered": [
0.21448874884638292,
0.19910567382286332,
0.21295600512932095,
0.2129719134034782,
0.19847037052286085
],
"per_seed_accuracy": [
0.2998147268980399,
0.2971767584373211,
0.2994404271991774,
0.29941309983976955,
0.29701269226786736
]
},
"topk8_global_ridge": {
"gap_recovered_mean": 0.20760713363515926,
"gap_recovered_std": 0.00804956171172338,
"accuracy_mean": 0.2985733282867519,
"accuracy_std": 0.0013561412938367808,
"per_seed_gap_recovered": [
0.21448340498167906,
0.1991690602795832,
0.21295518299628952,
0.21295721777554216,
0.19847080214270246
],
"per_seed_accuracy": [
0.2998131661469909,
0.29718647974935075,
0.2994436094020975,
0.2994110638958284,
0.29701232223949214
]
}
},
"16": {
"mean": {
"gap_recovered_mean": -0.010073316508325977,
"gap_recovered_std": 0.08824797021532643,
"accuracy_mean": 0.25940458604933203,
"accuracy_std": 0.015888833442519255,
"per_seed_gap_recovered": [
0.04609660444588505,
0.04871861276955448,
-0.03942812722304764,
0.04715635447666568,
-0.15291002701068745
],
"per_seed_accuracy": [
0.2694970770704336,
0.27004897209693646,
0.25407284990398366,
0.26969607279492525,
0.23370795838038125
]
},
"global_ridge": {
"gap_recovered_mean": 0.2123188520299978,
"gap_recovered_std": 0.0055871937881194125,
"accuracy_mean": 0.2994135150717593,
"accuracy_std": 0.0009899319263617858,
"per_seed_gap_recovered": [
0.21571068517093012,
0.21426208676963024,
0.21296079405422877,
0.21608726320595584,
0.20257343094924402
],
"per_seed_accuracy": [
0.30001587030805393,
0.29979992151534424,
0.299442303306755,
0.3001066524297341,
0.29770282779890916
]
},
"topk8_global_ridge": {
"gap_recovered_mean": 0.21201921742537935,
"gap_recovered_std": 0.005505111192090935,
"accuracy_mean": 0.2993448407787016,
"accuracy_std": 0.0009667310823210812,
"per_seed_gap_recovered": [
0.21541710146542262,
0.21387085421332014,
0.2126474791559681,
0.21573547248182634,
0.20242517981035965
],
"per_seed_accuracy": [
0.29994774891316206,
0.29971016041437787,
0.2993640720734651,
0.30002444234661674,
0.2976777801458863
]
}
},
"24": {
"mean": {
"gap_recovered_mean": 0.02425117328249178,
"gap_recovered_std": 0.0,
"accuracy_mean": 0.26560043321532767,
"accuracy_std": 0.0,
"per_seed_gap_recovered": [
0.02425117328249178,
0.02425117328249178,
0.02425117328249178,
0.02425117328249178,
0.02425117328249178
],
"per_seed_accuracy": [
0.26560043321532767,
0.26560043321532767,
0.26560043321532767,
0.26560043321532767,
0.26560043321532767
]
},
"global_ridge": {
"gap_recovered_mean": 0.21777514342604026,
"gap_recovered_std": 0.0,
"accuracy_mean": 0.3004704752078001,
"accuracy_std": 0.0,
"per_seed_gap_recovered": [
0.21777514342604026,
0.21777514342604026,
0.21777514342604026,
0.21777514342604026,
0.21777514342604026
],
"per_seed_accuracy": [
0.3004704752078001,
0.3004704752078001,
0.3004704752078001,
0.3004704752078001,
0.3004704752078001
]
},
"topk8_global_ridge": {
"gap_recovered_mean": 0.21633328651559772,
"gap_recovered_std": 0.0,
"accuracy_mean": 0.3001800095103253,
"accuracy_std": 0.0,
"per_seed_gap_recovered": [
0.21633328651559772,
0.21633328651559772,
0.21633328651559772,
0.21633328651559772,
0.21633328651559772
],
"per_seed_accuracy": [
0.3001800095103253,
0.3001800095103253,
0.3001800095103253,
0.3001800095103253,
0.3001800095103253
]
}
}
}
}