| { |
| "config": { |
| "model_X": "Qwen/Qwen2.5-3B-Instruct", |
| "model_Y": "meta-llama/Llama-3.2-3B-Instruct", |
| "hub_repo": "CK0607/cross-model-lora-prediction-3b", |
| "round6_real_generation_eval": true, |
| "round5_surrogate_deprecated": true, |
| "no_surrogate": true, |
| "eval_examples_requested": 300, |
| "generation": { |
| "do_sample": false, |
| "num_beams": 1, |
| "greedy": true, |
| "max_new_tokens_code": 96, |
| "max_new_tokens_other": 24 |
| }, |
| "pool_anchor_names": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "heldouts": [ |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "openbookqa_test" |
| ], |
| "N_values": [ |
| 4, |
| 8, |
| 12, |
| 16, |
| 24 |
| ], |
| "methods": [ |
| "mean", |
| "global_ridge", |
| "topk8_global_ridge" |
| ], |
| "seeds_for_subsampled_N": [ |
| 0, |
| 1, |
| 2 |
| ], |
| "N24_seed": 0, |
| "budget_reduction": null, |
| "wall_seconds": 724.129 |
| }, |
| "adapter_verification": { |
| "listing": { |
| "round4/X": [ |
| "aqua_rat", |
| "arc_challenge", |
| "arc_easy", |
| "gsm8k", |
| "gsm8k_test_500", |
| "gsm_hard", |
| "humaneval", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "mbpp_plus", |
| "mbpp_sanitized", |
| "mbpp_test_held", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "openbookqa", |
| "openbookqa_test", |
| "sciq", |
| "svamp" |
| ], |
| "round4/Y": [ |
| "aqua_rat", |
| "arc_challenge", |
| "arc_easy", |
| "gsm8k", |
| "gsm8k_test_500", |
| "gsm_hard", |
| "humaneval", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "mbpp_plus", |
| "mbpp_sanitized", |
| "mbpp_test_held", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "openbookqa", |
| "openbookqa_test", |
| "sciq", |
| "svamp" |
| ], |
| "round5/X": [ |
| "aqua_rat_numeric", |
| "conala_curated", |
| "humaneval", |
| "math_counting_easy", |
| "mawps", |
| "mbpp_sanitized", |
| "medmcqa_easy", |
| "pubmedqa_pqal" |
| ], |
| "round5/Y": [ |
| "aqua_rat_numeric", |
| "conala_curated", |
| "humaneval", |
| "math_counting_easy", |
| "mawps", |
| "mbpp_sanitized", |
| "medmcqa_easy", |
| "pubmedqa_pqal" |
| ] |
| }, |
| "missing": [] |
| }, |
| "baselines": { |
| "gsm_hard": { |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15 |
| }, |
| "gsm8k_test_500": { |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333 |
| }, |
| "mbpp_test_held": { |
| "base_Y": 0.23, |
| "oracle": 0.32 |
| }, |
| "mbpp_plus": { |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45 |
| }, |
| "openbookqa_test": { |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333 |
| } |
| }, |
| "records": [ |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.08, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 16.047, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 6.767, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.10937500000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.07666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 6.193, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.015625000000000038, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.09333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 12.83, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.06250000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.08666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 5.713, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.03125000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 7.464, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N4_seed0", |
| "selected_topk": [ |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.07666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 15.428, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.015625000000000038, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N4_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.06, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 7.106, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N4_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:conala_curated", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.07333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 6.161, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.03125000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.09333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 10.252, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.06250000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.07333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 11.38, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.03125000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.08, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 5.259, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 8.174, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 9.26, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 6.14, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N8_seed0", |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:mbpp", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 8.825, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N8_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.07333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 13.534, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": -0.03125000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N8_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.08333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 5.216, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.015624999999999972, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 4.862, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 6.151, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.09, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 4.764, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.04687499999999998, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 10.116, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.10666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 9.274, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.12500000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 5.933, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N12_seed0", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.09333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 5.642, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.06250000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N12_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.09, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 6.188, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.04687499999999998, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N12_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.09333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 4.729, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.06250000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 4.316, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.08, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 4.721, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 4.675, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.10333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 5.288, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.109375, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.10333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 5.185, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.109375, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 5.741, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N16_seed0", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology" |
| ], |
| "accuracy": 0.1, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 4.798, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.09375000000000003, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N16_seed1", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa" |
| ], |
| "accuracy": 0.08333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 4.775, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.015624999999999972, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N16_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.09666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 4.732, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.07812499999999999, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_global_ridge_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.09333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 4.367, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.06250000000000001, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_mean_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.10333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 5.883, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.109375, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm8k_test_500", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm8k_test_500_topk8_global_ridge_N24_full", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "accuracy": 0.09, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 4.254, |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "gap_recovered": 0.04687499999999998, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.05, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 18.317, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.15384615384615388, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.03666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 14.877, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.30769230769230776, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.04666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 14.164, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.19230769230769237, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 18.709, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 15.851, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 15.649, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.07692307692307702, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N4_seed0", |
| "selected_topk": [ |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.05333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 18.151, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.11538461538461542, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N4_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.03333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 16.423, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.34615384615384626, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N4_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:conala_curated", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.04, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 14.211, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.26923076923076933, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.08, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 17.133, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.1923076923076923, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 17.412, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 13.813, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.038461538461538554, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 16.831, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.07666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 19.168, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.15384615384615374, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 15.834, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N8_seed0", |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:mbpp", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.08333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 17.033, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.23076923076923067, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N8_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 17.685, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.07692307692307702, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N8_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:medmcqa_easy", |
| "r5:conala_curated", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 12.006, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.07692307692307702, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.06, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 14.247, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.038461538461538554, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 16.792, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 10.577, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.07, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 17.127, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.07692307692307694, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.07, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 14.327, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.07692307692307694, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 13.838, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N12_seed0", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 16.357, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N12_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.07333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 16.623, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.11538461538461534, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N12_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 10.698, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 10.982, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.07692307692307702, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 11.658, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 11.127, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.07, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 13.972, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.07692307692307694, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 14.059, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.06, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 14.114, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.038461538461538554, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N16_seed0", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math", |
| "r4:mmlu_high_school_biology" |
| ], |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 10.721, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N16_seed1", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r4:openbookqa" |
| ], |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 11.104, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N16_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.056666666666666664, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 10.618, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": -0.07692307692307702, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_global_ridge_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.06333333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 11.544, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.0, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_mean_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 13.897, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "gsm_hard", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/gsm_hard_topk8_global_ridge_N24_full", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_high_school_physics", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:mmlu_elementary_math" |
| ], |
| "accuracy": 0.06666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 10.731, |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "gap_recovered": 0.038461538461538394, |
| "domain": "math" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.20666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 91.152, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.04285714285714289, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.26666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 88.715, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.21428571428571425, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 92.565, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.20666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 92.101, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.04285714285714289, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 94.791, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 92.398, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N4_seed0", |
| "selected_topk": [ |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.20666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 91.879, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.04285714285714289, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N4_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 86.762, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.22857142857142862, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N4_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:conala_curated", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 92.694, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 87.079, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 87.223, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.22857142857142862, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.26666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 90.746, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.21428571428571425, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 89.515, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 92.797, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 94.052, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N8_seed0", |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:math_counting_easy", |
| "r4:mbpp" |
| ], |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 89.202, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N8_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 87.665, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.22857142857142862, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N8_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:math_algebra_easy", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.2733333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 72.699, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.24285714285714274, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 92.332, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.27666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 71.408, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.2571428571428571, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.27666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 71.427, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.2571428571428571, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 91.029, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.21, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 75.373, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.028571428571428636, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 75.328, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N12_seed0", |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.21666666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 74.153, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N12_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 72.042, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.22857142857142862, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N12_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.2633333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 71.075, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.19999999999999984, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 71.109, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.22857142857142862, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.2733333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 70.902, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.24285714285714274, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.26666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 73.046, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.21428571428571425, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 73.969, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.21, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 78.014, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.028571428571428636, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 75.572, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N16_seed0", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "accuracy": 0.26666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 70.097, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.21428571428571425, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N16_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.2733333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 70.33, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.24285714285714274, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N16_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa" |
| ], |
| "accuracy": 0.26666666666666666, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 69.202, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.21428571428571425, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.2733333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 69.544, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.24285714285714274, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.21333333333333335, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 77.605, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": -0.01428571428571426, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_plus", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N24_full", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "accuracy": 0.2833333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 71.1, |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "gap_recovered": 0.28571428571428564, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 7, |
| "eval_seconds": 30.304, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.28, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 30.334, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.5555555555555558, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 29.778, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 30.954, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 29.348, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 30.259, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N4_seed0", |
| "selected_topk": [ |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 0, |
| "eval_seconds": 29.348, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N4_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 7, |
| "eval_seconds": 29.833, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.44444444444444453, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N4_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:conala_curated", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 29.391, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 28.852, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 3, |
| "eval_seconds": 30.289, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.44444444444444453, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 2, |
| "eval_seconds": 28.2, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 3, |
| "eval_seconds": 29.436, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 2, |
| "eval_seconds": 28.812, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 1, |
| "eval_seconds": 30.096, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N8_seed0", |
| "selected_topk": [ |
| "r5:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:mbpp", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 28.379, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N8_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.27, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 31.024, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.44444444444444453, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N8_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:math_algebra_easy", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:mbpp_sanitized", |
| "r4:arc_easy" |
| ], |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 3, |
| "eval_seconds": 29.802, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 1, |
| "eval_seconds": 29.335, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 0, |
| "eval_seconds": 24.376, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 7, |
| "eval_seconds": 25.224, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 0, |
| "eval_seconds": 29.425, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.23, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 7, |
| "eval_seconds": 25.255, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.0, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 24.291, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N12_seed0", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 2, |
| "eval_seconds": 28.665, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N12_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 1, |
| "eval_seconds": 24.393, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N12_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 0, |
| "eval_seconds": 24.33, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 24.197, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 24.539, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.26, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 24.852, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.3333333333333333, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 23.992, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 24.993, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 3, |
| "eval_seconds": 24.299, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N16_seed0", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 7, |
| "eval_seconds": 25.384, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N16_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 6, |
| "eval_seconds": 25.052, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N16_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:openbookqa" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 5, |
| "eval_seconds": 24.877, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 3, |
| "eval_seconds": 23.396, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.24, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 2, |
| "eval_seconds": 23.907, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.11111111111111091, |
| "domain": "code" |
| }, |
| { |
| "task": "mbpp_test_held", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N24_full", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:mmlu_elementary_math" |
| ], |
| "accuracy": 0.25, |
| "real_generation_eval": true, |
| "eval_examples": 100, |
| "gpu": 4, |
| "eval_seconds": 24.651, |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "gap_recovered": 0.22222222222222213, |
| "domain": "code" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.73, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 10.099, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.69, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 4.078, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": -0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.71, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 2.76, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.0, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N4_seed0", |
| "selected_topk": null, |
| "accuracy": 0.75, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 2.965, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.14634146341463428, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N4_seed1", |
| "selected_topk": null, |
| "accuracy": 0.74, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 4.032, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.10975609756097571, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N4_seed2", |
| "selected_topk": null, |
| "accuracy": 0.7466666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 3.041, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.1341463414634149, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N4_seed0", |
| "selected_topk": [ |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.72, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 9.433, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.03658536585365857, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N4_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.69, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 4.006, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": -0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 4, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r5:aqua_rat_numeric", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N4_seed2", |
| "selected_topk": [ |
| "r4:math_counting_easy", |
| "r5:conala_curated", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.7166666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 3.818, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.02439024390243918, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.7433333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 6.422, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.1219512195121951, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.7333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 5.598, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.08536585365853654, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.7333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 3.741, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.08536585365853654, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N8_seed0", |
| "selected_topk": null, |
| "accuracy": 0.7533333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 4.328, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.15853658536585366, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N8_seed1", |
| "selected_topk": null, |
| "accuracy": 0.7433333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 3.192, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.1219512195121951, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N8_seed2", |
| "selected_topk": null, |
| "accuracy": 0.74, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 2.692, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.10975609756097571, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N8_seed0", |
| "selected_topk": [ |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:conala_curated", |
| "r4:mbpp", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.7366666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 6.351, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.09756097560975632, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp_sanitized", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N8_seed1", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated", |
| "r5:math_counting_easy" |
| ], |
| "accuracy": 0.7366666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 4.449, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.09756097560975632, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 8, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:math_counting_easy", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N8_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:math_algebra_easy", |
| "r5:medmcqa_easy", |
| "r5:conala_curated", |
| "r5:mbpp_sanitized", |
| "r5:aqua_rat_numeric" |
| ], |
| "accuracy": 0.73, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 4.862, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.7333333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 10.478, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.08536585365853654, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.71, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 6.801, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.0, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.7633333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 3.86, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.19512195121951226, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N12_seed0", |
| "selected_topk": null, |
| "accuracy": 0.7366666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 2.48, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.09756097560975632, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N12_seed1", |
| "selected_topk": null, |
| "accuracy": 0.7433333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 2.299, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.1219512195121951, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N12_seed2", |
| "selected_topk": null, |
| "accuracy": 0.7566666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 2.517, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.17073170731707346, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_elementary_math", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N12_seed0", |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:svamp", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.7366666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 9.074, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.09756097560975632, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:arc_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N12_seed1", |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.7033333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 7, |
| "eval_seconds": 6.701, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": -0.024390243902438775, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 12, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N12_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy", |
| "r5:conala_curated" |
| ], |
| "accuracy": 0.7633333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 6, |
| "eval_seconds": 4.001, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.19512195121951226, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.73, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 8.029, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.7433333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 7.467, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.1219512195121951, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.76, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 3.776, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.18292682926829285, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N16_seed0", |
| "selected_topk": null, |
| "accuracy": 0.7533333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 2.287, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.15853658536585366, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N16_seed1", |
| "selected_topk": null, |
| "accuracy": 0.7566666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 2.284, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.17073170731707346, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "mean", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N16_seed2", |
| "selected_topk": null, |
| "accuracy": 0.7533333333333333, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 2.384, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.15853658536585366, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 0, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:math_counting_easy", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N16_seed0", |
| "selected_topk": [ |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy" |
| ], |
| "accuracy": 0.73, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 5, |
| "eval_seconds": 7.751, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.07317073170731714, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 1, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:math_counting_easy", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N16_seed1", |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r5:humaneval", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r5:pubmedqa_pqal", |
| "r4:aqua_rat" |
| ], |
| "accuracy": 0.74, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 4, |
| "eval_seconds": 6.88, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.10975609756097571, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 16, |
| "seed": 2, |
| "deterministic_full_pool": false, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:multiarith", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mbpp_sanitized", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mbpp_sanitized", |
| "r5:conala_curated", |
| "r5:medmcqa_easy" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N16_seed2", |
| "selected_topk": [ |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:multiarith", |
| "r4:math_algebra_easy", |
| "r4:openbookqa", |
| "r4:aqua_rat", |
| "r5:medmcqa_easy" |
| ], |
| "accuracy": 0.76, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 3, |
| "eval_seconds": 3.951, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.18292682926829285, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_global_ridge_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.75, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 1, |
| "eval_seconds": 10.959, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.14634146341463428, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "mean", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_mean_N24_full", |
| "selected_topk": null, |
| "accuracy": 0.7566666666666667, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 0, |
| "eval_seconds": 2.284, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.17073170731707346, |
| "domain": "science" |
| }, |
| { |
| "task": "openbookqa_test", |
| "N": 24, |
| "seed": 0, |
| "deterministic_full_pool": true, |
| "method": "topk8_global_ridge", |
| "anchors": [ |
| "r4:gsm8k", |
| "r4:mbpp", |
| "r4:sciq", |
| "r4:arc_easy", |
| "r4:openbookqa", |
| "r4:svamp", |
| "r4:multiarith", |
| "r4:mmlu_high_school_biology", |
| "r4:math_counting_easy", |
| "r4:humaneval", |
| "r4:mmlu_high_school_physics", |
| "r4:mbpp_sanitized", |
| "r4:mmlu_elementary_math", |
| "r4:math_algebra_easy", |
| "r4:aqua_rat", |
| "r4:medmcqa_easy", |
| "r5:aqua_rat_numeric", |
| "r5:math_counting_easy", |
| "r5:mawps", |
| "r5:mbpp_sanitized", |
| "r5:humaneval", |
| "r5:conala_curated", |
| "r5:medmcqa_easy", |
| "r5:pubmedqa_pqal" |
| ], |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/openbookqa_test_topk8_global_ridge_N24_full", |
| "selected_topk": [ |
| "r4:mmlu_high_school_physics", |
| "r4:mmlu_high_school_biology", |
| "r4:mbpp_sanitized", |
| "r4:math_counting_easy", |
| "r4:mmlu_elementary_math", |
| "r5:humaneval", |
| "r4:humaneval", |
| "r4:multiarith" |
| ], |
| "accuracy": 0.7133333333333334, |
| "real_generation_eval": true, |
| "eval_examples": 300, |
| "gpu": 2, |
| "eval_seconds": 10.881, |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "gap_recovered": 0.012195121951219795, |
| "domain": "science" |
| } |
| ], |
| "summary": { |
| "4": { |
| "mean": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.030148586169927653, |
| "gap_recovered_std": 0.06461483844499136, |
| "accuracy_mean": 0.26822222222222225, |
| "accuracy_std": 0.2560873693502683 |
| }, |
| "global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": -0.003457468457468462, |
| "gap_recovered_std": 0.19538572711097718, |
| "accuracy_mean": 0.2604444444444444, |
| "accuracy_std": 0.24815147277248514 |
| }, |
| "topk8_global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": -0.016895374837448015, |
| "gap_recovered_std": 0.18327362210993292, |
| "accuracy_mean": 0.2591111111111111, |
| "accuracy_std": 0.24823653168173224 |
| } |
| }, |
| "8": { |
| "mean": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.06945515922650067, |
| "gap_recovered_std": 0.06188765948601224, |
| "accuracy_mean": 0.2728888888888889, |
| "accuracy_std": 0.25359906697873863 |
| }, |
| "global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.13060651746627353, |
| "gap_recovered_std": 0.14004780290486193, |
| "accuracy_mean": 0.2797777777777778, |
| "accuracy_std": 0.2511196093519146 |
| }, |
| "topk8_global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.1253655744661842, |
| "gap_recovered_std": 0.15427112515492677, |
| "accuracy_mean": 0.27955555555555556, |
| "accuracy_std": 0.250399257905826 |
| } |
| }, |
| "12": { |
| "mean": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.07704113806247954, |
| "gap_recovered_std": 0.07136104275669189, |
| "accuracy_mean": 0.27355555555555555, |
| "accuracy_std": 0.2533159977360188 |
| }, |
| "global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.12578582720351011, |
| "gap_recovered_std": 0.12796246741216633, |
| "accuracy_mean": 0.2813333333333334, |
| "accuracy_std": 0.24984376070237324 |
| }, |
| "topk8_global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.11746010031071008, |
| "gap_recovered_std": 0.10911242321364244, |
| "accuracy_mean": 0.2793333333333333, |
| "accuracy_std": 0.24933307868588983 |
| } |
| }, |
| "16": { |
| "mean": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.07689456207748886, |
| "gap_recovered_std": 0.07100176997488256, |
| "accuracy_mean": 0.2748888888888889, |
| "accuracy_std": 0.25722841022417225 |
| }, |
| "global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.1365069252111935, |
| "gap_recovered_std": 0.12641589655600136, |
| "accuracy_mean": 0.2848888888888889, |
| "accuracy_std": 0.25340809256150715 |
| }, |
| "topk8_global_ridge": { |
| "n_records": 15, |
| "gap_recovered_mean": 0.12353249054468564, |
| "gap_recovered_std": 0.10122678980066042, |
| "accuracy_mean": 0.2835555555555556, |
| "accuracy_std": 0.25276084017909567 |
| } |
| }, |
| "24": { |
| "mean": { |
| "n_records": 5, |
| "gap_recovered_mean": 0.0830787285208017, |
| "gap_recovered_std": 0.07181727060927716, |
| "accuracy_mean": 0.276, |
| "accuracy_std": 0.27834231522433744 |
| }, |
| "global_ridge": { |
| "n_records": 5, |
| "gap_recovered_mean": 0.13478416569879983, |
| "gap_recovered_std": 0.10350184199429305, |
| "accuracy_mean": 0.2859999999999999, |
| "accuracy_std": 0.2754329924561205 |
| }, |
| "topk8_global_ridge": { |
| "n_records": 5, |
| "gap_recovered_mean": 0.12109363366985319, |
| "gap_recovered_std": 0.12401845134797244, |
| "accuracy_mean": 0.2806666666666667, |
| "accuracy_std": 0.259950850055245 |
| } |
| } |
| } |
| } |