| { |
| "config": { |
| "model_X": "Qwen/Qwen2.5-3B-Instruct", |
| "model_Y": "meta-llama/Llama-3.2-3B-Instruct", |
| "hub_repo": "CK0607/cross-model-lora-prediction-3b", |
| "lora": { |
| "r": 16, |
| "alpha": 32, |
| "targets": [ |
| "q_proj", |
| "k_proj", |
| "v_proj", |
| "o_proj", |
| "gate_proj", |
| "up_proj", |
| "down_proj" |
| ], |
| "dropout": 0.0 |
| }, |
| "train": { |
| "examples": 1500, |
| "eval_examples": 300, |
| "epochs": 3.0, |
| "bs": 8, |
| "lr": 0.0002, |
| "max_len": 512 |
| }, |
| "anchors": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics" |
| ], |
| "heldout": [ |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "arc_challenge", |
| "openbookqa_test" |
| ] |
| }, |
| "dataset_audit": { |
| "tasks": { |
| "gsm8k": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "openai/gsm8k", |
| "config": "main", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Mimi picked up 2 dozen seashells on the beach. Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found. How many seashells did Leigh have?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "16" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mbpp": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "google-research-datasets/mbpp", |
| "config": "sanitized", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to reverse words seperated by spaces in a given string.\n\nTests:\nassert reverse_words(\"python program\")==(\"program python\")\nassert reverse_words(\"java language\")==(\"language java\")\nassert reverse_words(\"indian man\")==(\"man indian\")\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "def reverse_words(s):\n return ' '.join(reversed(s.split()))" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "sciq": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "allenai/sciq", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nWhat kind of reasoning involves formulating generalizations inferred from careful observation and the analysis of a large amount of data?\nA. skepticism\nB. reflexive\nC. inductive\nD. deductive\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "D" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "arc_easy": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "allenai/ai2_arc", |
| "config": "ARC-Easy", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nWhich of the following materials would best slow the transfer of heat?\nA. aluminum\nB. copper\nC. glass\nD. wood\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "D" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "openbookqa": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "allenai/openbookqa", |
| "config": "main", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nWith the addition of thrusters your forward momentum will\nA. stop\nB. increase\nC. decrease\nD. stall\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "B" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "svamp": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "ChilleD/SVAMP", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Danny collects bottle caps. He found 63 bottle caps at the park while he threw away 51 old ones. Now he has 33 bottle caps in his collection. How many bottle caps did danny have at first?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "21" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "multiarith": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "ChilleD/MultiArith", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: A florist had 11 roses. If she sold 2 of them and then later picked 32 more, how many roses would she have?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "41" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mmlu_high_school_biology": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "cais/mmlu", |
| "config": "high_school_biology", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nWhich of the following is (are) required when performing a polymerase chain reaction (P C R) on a DNA sample?\nA. Reverse transcriptase\nB. A vacuum chamber\nC. Primers complementary to certain portions of the DNA\nD. DNA ligase\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "C" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "math_counting_easy": { |
| "domain": "math", |
| "kind": "math_solution", |
| "dataset": "EleutherAI/hendrycks_math", |
| "config": "counting_and_probability", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Alex has 10 different kinds of lunch meat and 9 different kinds of cheese. If he wants to make a sandwich with one kind of meat and two kinds of cheese, how many different sandwiches could he make? (It does not matter in which order he chooses the two types of cheese.)\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "360" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "humaneval": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "openai/openai_humaneval", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Complete the following Python function. Respond with only the completion code.\n\ndef below_threshold(l: list, t: int):\n \"\"\"Return True if all numbers in the list l are below threshold t.\n >>> below_threshold([1, 2, 4, 10], 100)\n True\n >>> below_threshold([1, 20, 4, 10], 5)\n False\n \"\"\"" |
| }, |
| { |
| "role": "assistant", |
| "content": "for e in l:\n if e >= t:\n return False\n return True" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mmlu_high_school_physics": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "cais/mmlu", |
| "config": "high_school_physics", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nThe mass of the Earth is 5.97 \u00d7 10^24 kg. The Moon, whose center is 3.84 \u00d7 10^8 m from the Earth\u2019s center, has mass 7.35 \u00d7 10^22 kg. Which of the following is the best estimate of the gravitational force of the Earth on the Moon?\nA. 10^39 N\nB. 10^29 N\nC. 10^19 N\nD. 10^9 N\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "C" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mbpp_sanitized": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "google-research-datasets/mbpp", |
| "config": "sanitized", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to reverse words seperated by spaces in a given string.\n\nTests:\nassert reverse_words(\"python program\")==(\"program python\")\nassert reverse_words(\"java language\")==(\"language java\")\nassert reverse_words(\"indian man\")==(\"man indian\")\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "def reverse_words(s):\n return ' '.join(reversed(s.split()))" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mmlu_elementary_math": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "cais/mmlu", |
| "config": "elementary_mathematics", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nFind the median of the set of data 13, 35, 26, 8, 24, 10, 22, 10, 32.\nA. 23\nB. 20\nC. 22\nD. 27\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "C" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "math_algebra_easy": { |
| "domain": "math", |
| "kind": "math_solution", |
| "dataset": "EleutherAI/hendrycks_math", |
| "config": "algebra", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Given that $2^x+ 2^x+ 2^x+ 2^x= 512$, what is the value of $x$?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "7" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "aqua_rat": { |
| "domain": "math", |
| "kind": "mcq", |
| "dataset": "deepmind/aqua_rat", |
| "config": "raw", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nThere are r red ball, b blue ball and w white ball in a bag. What is the ratio of the number of blue ball to the total no. of ball in terms of r, b and w.?\nA. r / (r + b + w)\nB. r * (r + b + w)\nC. (r + b + w)\nD. r / (r + b )\nE. r / (b + w)\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "A" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "medmcqa_easy": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "openlifescienceai/medmcqa", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nApolipoprotein A-I (Apo A-I) is found in which of the following lipid components?\nA. LDL\nB. HDL\nC. VLDL\nD. Chylomicron\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "B" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "gsm_hard": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "reasoning-machines/gsm-hard", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Charlie wants to sell beeswax candles. For every pound of beeswax, he can make 10 tapered candles. One pound of beeswax and the wicks cost $10.00 in supplies. If he sells each candle for $2.00 each, what is his net profit if he makes and sells 1080379 candles?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "1080379.0" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "gsm8k_test_500": { |
| "domain": "math", |
| "kind": "math_num", |
| "dataset": "openai/gsm8k", |
| "config": "main", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Solve the math problem. Respond with only the final answer.\n\nProblem: Benny saw a 10-foot shark with 2 6-inch remoras attached to it. What percentage of the shark's body length is the combined length of the remoras?\n\nFinal answer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "10" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mbpp_test_held": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "google-research-datasets/mbpp", |
| "config": "sanitized", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to convert a given tuple of positive integers into a single integer.\n\nTests:\nassert tuple_to_int((1,2,3))==123\nassert tuple_to_int((4,5,6))==456\nassert tuple_to_int((5,6,7))==567\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "def tuple_to_int(nums):\n result = int(''.join(map(str,nums)))\n return result" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "mbpp_plus": { |
| "domain": "code", |
| "kind": "code", |
| "dataset": "evalplus/mbppplus", |
| "config": null, |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Write a Python function that solves the task. Respond with only code.\n\nTask: Write a function to check if the given tuples contain the k or not.\n\nTests:\nassert check_K((10, 4, 5, 6, 8), 6) == True\nassert check_K((1, 2, 3, 4, 5, 6), 7) == False\nassert check_K((7, 8, 9, 44, 11, 12), 11) == True\n\nCode:" |
| }, |
| { |
| "role": "assistant", |
| "content": "def check_K(test_tup, K):\n return K in test_tup" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "arc_challenge": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "allenai/ai2_arc", |
| "config": "ARC-Challenge", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nWhich characteristic can a human offspring inherit?\nA. facial scar\nB. blue eyes\nC. long hair\nD. broken leg\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "B" |
| } |
| ] |
| }, |
| "ok": true |
| }, |
| "openbookqa_test": { |
| "domain": "science", |
| "kind": "mcq", |
| "dataset": "allenai/openbookqa", |
| "config": "main", |
| "train_rows_sampled": 20, |
| "eval_rows_sampled": 10, |
| "labels": [ |
| "A", |
| "B", |
| "C", |
| "D", |
| "E", |
| "F", |
| "G", |
| "H" |
| ], |
| "sample": { |
| "messages": [ |
| { |
| "role": "user", |
| "content": "Answer the multiple-choice question. Respond with only the option letter.\n\nLive birth is exemplified in\nA. snakes slithering out of eggs\nB. a calf emerging from a mother giraffe\nC. owlets pecking out of their encasement\nD. sea turtles emerging from their shells\nAnswer:" |
| }, |
| { |
| "role": "assistant", |
| "content": "B" |
| } |
| ] |
| }, |
| "ok": true |
| } |
| }, |
| "dropped": [], |
| "kept": [ |
| "gsm8k", |
| "mbpp", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "svamp", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "math_counting_easy", |
| "humaneval", |
| "mmlu_high_school_physics", |
| "mbpp_sanitized", |
| "mmlu_elementary_math", |
| "math_algebra_easy", |
| "aqua_rat", |
| "medmcqa_easy", |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "arc_challenge", |
| "openbookqa_test" |
| ], |
| "anchor_domain_counts": { |
| "math": 6, |
| "code": 3, |
| "science": 7 |
| } |
| }, |
| "training": [ |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mbpp", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "svamp", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "humaneval", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "math_algebra_easy", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "gsm8k_test_500", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "openbookqa_test", |
| "ok": true, |
| "gpu": 3 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mbpp", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "svamp", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "humaneval", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "math_algebra_easy", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "gsm8k_test_500", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "openbookqa_test", |
| "ok": true, |
| "gpu": 2 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "sciq", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "multiarith", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mmlu_high_school_physics", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "aqua_rat", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mbpp_test_held", |
| "ok": true, |
| "gpu": 5 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "arc_easy", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mmlu_high_school_biology", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mbpp_sanitized", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "medmcqa_easy", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mbpp_plus", |
| "ok": true, |
| "gpu": 7 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "sciq", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "multiarith", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mmlu_high_school_physics", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "aqua_rat", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mbpp_test_held", |
| "ok": true, |
| "gpu": 4 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "arc_easy", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mmlu_high_school_biology", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mbpp_sanitized", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "medmcqa_easy", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mbpp_plus", |
| "ok": true, |
| "gpu": 6 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "gsm8k", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "openbookqa", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "math_counting_easy", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "mmlu_elementary_math", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "gsm_hard", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "Y", |
| "model": "meta-llama/Llama-3.2-3B-Instruct", |
| "task": "arc_challenge", |
| "ok": true, |
| "gpu": 1 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "gsm8k", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "openbookqa", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "math_counting_easy", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "mmlu_elementary_math", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "gsm_hard", |
| "ok": true, |
| "gpu": 0 |
| }, |
| { |
| "side": "X", |
| "model": "Qwen/Qwen2.5-3B-Instruct", |
| "task": "arc_challenge", |
| "ok": true, |
| "gpu": 0 |
| } |
| ], |
| "experiment1": { |
| "tasks": { |
| "gsm_hard": { |
| "domain": "math", |
| "selected": { |
| "topk8_global_ridge": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ], |
| "topk8_pertensor_ridge": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.06333333333333334, |
| "oracle": 0.15, |
| "mean": 0.056666666666666664, |
| "mean__cos": 0.8538880348205566, |
| "global_ridge": 0.06, |
| "global_ridge__cos": 0.9063830375671387, |
| "pertensor_ridge": 0.06666666666666667, |
| "pertensor_ridge__cos": 0.904435396194458, |
| "pertensor_pca": 0.06666666666666667, |
| "pertensor_pca__cos": 0.9048259854316711, |
| "pertensor_mlp": 0.07333333333333333, |
| "pertensor_mlp__cos": 0.90220707654953, |
| "procrustes": 0.07, |
| "procrustes__cos": 0.89919513463974, |
| "topk8_global_ridge": 0.06666666666666667, |
| "topk8_global_ridge__cos": 0.9051854014396667, |
| "topk8_pertensor_ridge": 0.06333333333333334, |
| "topk8_pertensor_ridge__cos": 0.903511106967926 |
| }, |
| "main_row": { |
| "Domain": "math", |
| "Task": "gsm_hard", |
| "base_Y": 0.06333333333333334, |
| "mean": 0.056666666666666664, |
| "global_ridge": 0.06, |
| "pertensor_ridge": 0.06666666666666667, |
| "topk8_global_ridge": 0.06666666666666667, |
| "topk8_pertensor_ridge": 0.06333333333333334, |
| "pertensor_mlp": 0.07333333333333333, |
| "oracle": 0.15, |
| "oracle_minus_base_pp": 8.666666666666666, |
| "usable": true, |
| "gap_recovered": 0.11538461538461534 |
| } |
| }, |
| "gsm8k_test_500": { |
| "domain": "math", |
| "selected": { |
| "topk8_global_ridge": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ], |
| "topk8_pertensor_ridge": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.08, |
| "oracle": 0.29333333333333333, |
| "mean": 0.09333333333333334, |
| "mean__cos": 0.9122087955474854, |
| "global_ridge": 0.1, |
| "global_ridge__cos": 0.9690855145454407, |
| "pertensor_ridge": 0.1, |
| "pertensor_ridge__cos": 0.9674875736236572, |
| "pertensor_pca": 0.1, |
| "pertensor_pca__cos": 0.9675261974334717, |
| "pertensor_mlp": 0.1, |
| "pertensor_mlp__cos": 0.9635453224182129, |
| "procrustes": 0.09666666666666666, |
| "procrustes__cos": 0.9608618021011353, |
| "topk8_global_ridge": 0.09333333333333334, |
| "topk8_global_ridge__cos": 0.9682586789131165, |
| "topk8_pertensor_ridge": 0.09666666666666666, |
| "topk8_pertensor_ridge__cos": 0.9668623805046082 |
| }, |
| "main_row": { |
| "Domain": "math", |
| "Task": "gsm8k_test_500", |
| "base_Y": 0.08, |
| "mean": 0.09333333333333334, |
| "global_ridge": 0.1, |
| "pertensor_ridge": 0.1, |
| "topk8_global_ridge": 0.09333333333333334, |
| "topk8_pertensor_ridge": 0.09666666666666666, |
| "pertensor_mlp": 0.1, |
| "oracle": 0.29333333333333333, |
| "oracle_minus_base_pp": 21.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.09375000000000003 |
| } |
| }, |
| "mbpp_test_held": { |
| "domain": "code", |
| "selected": { |
| "topk8_global_ridge": [ |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "humaneval", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_algebra_easy" |
| ], |
| "topk8_pertensor_ridge": [ |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "humaneval", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_algebra_easy" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.23, |
| "oracle": 0.32, |
| "mean": 0.24, |
| "mean__cos": 0.9324136972427368, |
| "global_ridge": 0.25, |
| "global_ridge__cos": 0.9998323917388916, |
| "pertensor_ridge": 0.25, |
| "pertensor_ridge__cos": 0.9997650980949402, |
| "pertensor_pca": 0.25, |
| "pertensor_pca__cos": 0.9902871251106262, |
| "pertensor_mlp": 0.24, |
| "pertensor_mlp__cos": 0.9861795902252197, |
| "procrustes": 0.24, |
| "procrustes__cos": 0.9862682819366455, |
| "topk8_global_ridge": 0.25, |
| "topk8_global_ridge__cos": 0.9998441338539124, |
| "topk8_pertensor_ridge": 0.25, |
| "topk8_pertensor_ridge__cos": 0.9995993375778198 |
| }, |
| "main_row": { |
| "Domain": "code", |
| "Task": "mbpp_test_held", |
| "base_Y": 0.23, |
| "mean": 0.24, |
| "global_ridge": 0.25, |
| "pertensor_ridge": 0.25, |
| "topk8_global_ridge": 0.25, |
| "topk8_pertensor_ridge": 0.25, |
| "pertensor_mlp": 0.24, |
| "oracle": 0.32, |
| "oracle_minus_base_pp": 9.0, |
| "usable": true, |
| "gap_recovered": 0.22222222222222213 |
| } |
| }, |
| "mbpp_plus": { |
| "domain": "code", |
| "selected": { |
| "topk8_global_ridge": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "math_counting_easy", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_algebra_easy" |
| ], |
| "topk8_pertensor_ridge": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "math_counting_easy", |
| "mmlu_high_school_physics", |
| "multiarith", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_algebra_easy" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.21666666666666667, |
| "oracle": 0.45, |
| "mean": 0.21333333333333335, |
| "mean__cos": 0.9124462008476257, |
| "global_ridge": 0.28, |
| "global_ridge__cos": 0.9848983287811279, |
| "pertensor_ridge": 0.27, |
| "pertensor_ridge__cos": 0.9846793413162231, |
| "pertensor_pca": 0.21, |
| "pertensor_pca__cos": 0.9684830904006958, |
| "pertensor_mlp": 0.21, |
| "pertensor_mlp__cos": 0.9642786383628845, |
| "procrustes": 0.22666666666666666, |
| "procrustes__cos": 0.9647142887115479, |
| "topk8_global_ridge": 0.27, |
| "topk8_global_ridge__cos": 0.9848766922950745, |
| "topk8_pertensor_ridge": 0.26666666666666666, |
| "topk8_pertensor_ridge__cos": 0.9846369624137878 |
| }, |
| "main_row": { |
| "Domain": "code", |
| "Task": "mbpp_plus", |
| "base_Y": 0.21666666666666667, |
| "mean": 0.21333333333333335, |
| "global_ridge": 0.28, |
| "pertensor_ridge": 0.27, |
| "topk8_global_ridge": 0.27, |
| "topk8_pertensor_ridge": 0.26666666666666666, |
| "pertensor_mlp": 0.21, |
| "oracle": 0.45, |
| "oracle_minus_base_pp": 23.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.2714285714285715 |
| } |
| }, |
| "arc_challenge": { |
| "domain": "science", |
| "selected": { |
| "topk8_global_ridge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "topk8_pertensor_ridge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math", |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.7157190635451505, |
| "oracle": 0.7224080267558528, |
| "mean": 0.7324414715719063, |
| "mean__cos": 0.8707941174507141, |
| "global_ridge": 0.7357859531772575, |
| "global_ridge__cos": 0.9247814416885376, |
| "pertensor_ridge": 0.7290969899665551, |
| "pertensor_ridge__cos": 0.9244466423988342, |
| "pertensor_pca": 0.7357859531772575, |
| "pertensor_pca__cos": 0.9224098920822144, |
| "pertensor_mlp": 0.7391304347826086, |
| "pertensor_mlp__cos": 0.920215368270874, |
| "procrustes": 0.7491638795986622, |
| "procrustes__cos": 0.9195225238800049, |
| "topk8_global_ridge": 0.7357859531772575, |
| "topk8_global_ridge__cos": 0.9233748316764832, |
| "topk8_pertensor_ridge": 0.7290969899665551, |
| "topk8_pertensor_ridge__cos": 0.9231237173080444 |
| }, |
| "main_row": { |
| "Domain": "science", |
| "Task": "arc_challenge", |
| "base_Y": 0.7157190635451505, |
| "mean": 0.7324414715719063, |
| "global_ridge": 0.7357859531772575, |
| "pertensor_ridge": 0.7290969899665551, |
| "topk8_global_ridge": 0.7357859531772575, |
| "topk8_pertensor_ridge": 0.7290969899665551, |
| "pertensor_mlp": 0.7391304347826086, |
| "oracle": 0.7224080267558528, |
| "oracle_minus_base_pp": 0.6688963210702337, |
| "usable": false, |
| "gap_recovered": 5.0 |
| } |
| }, |
| "openbookqa_test": { |
| "domain": "science", |
| "selected": { |
| "topk8_global_ridge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "mmlu_elementary_math", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "topk8_pertensor_ridge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "mmlu_elementary_math", |
| "humaneval", |
| "multiarith", |
| "math_algebra_easy" |
| ] |
| }, |
| "metrics": { |
| "base_Y": 0.71, |
| "oracle": 0.9833333333333333, |
| "mean": 0.76, |
| "mean__cos": 0.9152830243110657, |
| "global_ridge": 0.7466666666666667, |
| "global_ridge__cos": 0.9711560010910034, |
| "pertensor_ridge": 0.7433333333333333, |
| "pertensor_ridge__cos": 0.9709675312042236, |
| "pertensor_pca": 0.76, |
| "pertensor_pca__cos": 0.9696671366691589, |
| "pertensor_mlp": 0.7533333333333333, |
| "pertensor_mlp__cos": 0.9671377539634705, |
| "procrustes": 0.7366666666666667, |
| "procrustes__cos": 0.9668078422546387, |
| "topk8_global_ridge": 0.7133333333333334, |
| "topk8_global_ridge__cos": 0.969284176826477, |
| "topk8_pertensor_ridge": 0.7166666666666667, |
| "topk8_pertensor_ridge__cos": 0.9690907597541809 |
| }, |
| "main_row": { |
| "Domain": "science", |
| "Task": "openbookqa_test", |
| "base_Y": 0.71, |
| "mean": 0.76, |
| "global_ridge": 0.7466666666666667, |
| "pertensor_ridge": 0.7433333333333333, |
| "topk8_global_ridge": 0.7133333333333334, |
| "topk8_pertensor_ridge": 0.7166666666666667, |
| "pertensor_mlp": 0.7533333333333333, |
| "oracle": 0.9833333333333333, |
| "oracle_minus_base_pp": 27.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.18292682926829285 |
| } |
| } |
| }, |
| "main_table": [ |
| { |
| "Domain": "math", |
| "Task": "gsm_hard", |
| "base_Y": 0.06333333333333334, |
| "mean": 0.056666666666666664, |
| "global_ridge": 0.06, |
| "pertensor_ridge": 0.06666666666666667, |
| "topk8_global_ridge": 0.06666666666666667, |
| "topk8_pertensor_ridge": 0.06333333333333334, |
| "pertensor_mlp": 0.07333333333333333, |
| "oracle": 0.15, |
| "oracle_minus_base_pp": 8.666666666666666, |
| "usable": true, |
| "gap_recovered": 0.11538461538461534 |
| }, |
| { |
| "Domain": "math", |
| "Task": "gsm8k_test_500", |
| "base_Y": 0.08, |
| "mean": 0.09333333333333334, |
| "global_ridge": 0.1, |
| "pertensor_ridge": 0.1, |
| "topk8_global_ridge": 0.09333333333333334, |
| "topk8_pertensor_ridge": 0.09666666666666666, |
| "pertensor_mlp": 0.1, |
| "oracle": 0.29333333333333333, |
| "oracle_minus_base_pp": 21.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.09375000000000003 |
| }, |
| { |
| "Domain": "code", |
| "Task": "mbpp_test_held", |
| "base_Y": 0.23, |
| "mean": 0.24, |
| "global_ridge": 0.25, |
| "pertensor_ridge": 0.25, |
| "topk8_global_ridge": 0.25, |
| "topk8_pertensor_ridge": 0.25, |
| "pertensor_mlp": 0.24, |
| "oracle": 0.32, |
| "oracle_minus_base_pp": 9.0, |
| "usable": true, |
| "gap_recovered": 0.22222222222222213 |
| }, |
| { |
| "Domain": "code", |
| "Task": "mbpp_plus", |
| "base_Y": 0.21666666666666667, |
| "mean": 0.21333333333333335, |
| "global_ridge": 0.28, |
| "pertensor_ridge": 0.27, |
| "topk8_global_ridge": 0.27, |
| "topk8_pertensor_ridge": 0.26666666666666666, |
| "pertensor_mlp": 0.21, |
| "oracle": 0.45, |
| "oracle_minus_base_pp": 23.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.2714285714285715 |
| }, |
| { |
| "Domain": "science", |
| "Task": "arc_challenge", |
| "base_Y": 0.7157190635451505, |
| "mean": 0.7324414715719063, |
| "global_ridge": 0.7357859531772575, |
| "pertensor_ridge": 0.7290969899665551, |
| "topk8_global_ridge": 0.7357859531772575, |
| "topk8_pertensor_ridge": 0.7290969899665551, |
| "pertensor_mlp": 0.7391304347826086, |
| "oracle": 0.7224080267558528, |
| "oracle_minus_base_pp": 0.6688963210702337, |
| "usable": false, |
| "gap_recovered": 5.0 |
| }, |
| { |
| "Domain": "science", |
| "Task": "openbookqa_test", |
| "base_Y": 0.71, |
| "mean": 0.76, |
| "global_ridge": 0.7466666666666667, |
| "pertensor_ridge": 0.7433333333333333, |
| "topk8_global_ridge": 0.7133333333333334, |
| "topk8_pertensor_ridge": 0.7166666666666667, |
| "pertensor_mlp": 0.7533333333333333, |
| "oracle": 0.9833333333333333, |
| "oracle_minus_base_pp": 27.333333333333332, |
| "usable": true, |
| "gap_recovered": 0.18292682926829285 |
| } |
| ], |
| "anchor_names": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics" |
| ], |
| "heldout_names": [ |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "arc_challenge", |
| "openbookqa_test" |
| ] |
| }, |
| "experiment2": { |
| "curves": { |
| "N5_global_ridge": { |
| "math": -0.010817307692307696, |
| "code": 0.026984126984126878, |
| "science": 1.5304878048780488 |
| }, |
| "N12_global_ridge": { |
| "math": -0.03425480769230772, |
| "code": 0.2539682539682539, |
| "science": 2.073170731707317 |
| }, |
| "N12_topk8_global_ridge": { |
| "math": -0.010817307692307696, |
| "code": 0.2738095238095238, |
| "science": 1.3353658536585367 |
| }, |
| "N12_topk12_global_ridge": { |
| "math": -0.018629807692307716, |
| "code": 0.23253968253968244, |
| "science": 1.829268292682927 |
| }, |
| "N16_global_ridge": { |
| "math": 0.027644230769230737, |
| "code": 0.24682539682539684, |
| "science": 1.5670731707317074 |
| }, |
| "N16_topk8_global_ridge": { |
| "math": 0.050480769230769204, |
| "code": 0.22539682539682537, |
| "science": 1.50609756097561 |
| }, |
| "N16_topk12_global_ridge": { |
| "math": 0.03906249999999999, |
| "code": 0.2396825396825396, |
| "science": 2.3170731707317076 |
| } |
| }, |
| "details": { |
| "N5_anchors": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy" |
| ], |
| "gsm_hard": { |
| "N5_global_ridge": { |
| "acc": 0.05333333333333334, |
| "gap_recovered": -0.11538461538461542 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.05333333333333334, |
| "gap_recovered": -0.11538461538461542 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.05333333333333334, |
| "gap_recovered": -0.11538461538461542 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.05333333333333334, |
| "gap_recovered": -0.11538461538461542 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.06333333333333334, |
| "gap_recovered": 0.0 |
| } |
| }, |
| "gsm8k_test_500": { |
| "N5_global_ridge": { |
| "acc": 0.1, |
| "gap_recovered": 0.09375000000000003 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.09, |
| "gap_recovered": 0.04687499999999998 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.1, |
| "gap_recovered": 0.09375000000000003 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.09666666666666666, |
| "gap_recovered": 0.07812499999999999 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.09666666666666666, |
| "gap_recovered": 0.07812499999999999 |
| } |
| }, |
| "mbpp_test_held": { |
| "N5_global_ridge": { |
| "acc": 0.24, |
| "gap_recovered": 0.11111111111111091 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.25, |
| "gap_recovered": 0.22222222222222213 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.26, |
| "gap_recovered": 0.3333333333333333 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.25, |
| "gap_recovered": 0.22222222222222213 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.25, |
| "gap_recovered": 0.22222222222222213 |
| } |
| }, |
| "mbpp_plus": { |
| "N5_global_ridge": { |
| "acc": 0.20333333333333334, |
| "gap_recovered": -0.057142857142857155 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.2833333333333333, |
| "gap_recovered": 0.28571428571428564 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.26666666666666666, |
| "gap_recovered": 0.21428571428571425 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.2733333333333333, |
| "gap_recovered": 0.24285714285714274 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.27666666666666667, |
| "gap_recovered": 0.2571428571428571 |
| } |
| }, |
| "arc_challenge": { |
| "N5_global_ridge": { |
| "acc": 0.7357859531772575, |
| "gap_recovered": 3.0 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.7424749163879598, |
| "gap_recovered": 4.0 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.7324414715719063, |
| "gap_recovered": 2.5 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.7391304347826086, |
| "gap_recovered": 3.5 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.745819397993311, |
| "gap_recovered": 4.5 |
| } |
| }, |
| "openbookqa_test": { |
| "N5_global_ridge": { |
| "acc": 0.7266666666666667, |
| "gap_recovered": 0.060975609756097754 |
| }, |
| "N12_global_ridge": { |
| "acc": 0.75, |
| "gap_recovered": 0.14634146341463428 |
| }, |
| "N12_topk8_global_ridge": { |
| "acc": 0.7566666666666667, |
| "gap_recovered": 0.17073170731707346 |
| }, |
| "N12_topk12_global_ridge": { |
| "acc": 0.7533333333333333, |
| "gap_recovered": 0.15853658536585366 |
| }, |
| "N16_topk12_global_ridge": { |
| "acc": 0.7466666666666667, |
| "gap_recovered": 0.1341463414634149 |
| } |
| }, |
| "N12_anchors": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized", |
| "sciq", |
| "arc_easy", |
| "openbookqa" |
| ], |
| "N16_anchors": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics" |
| ] |
| } |
| }, |
| "experiment3": { |
| "pools": { |
| "math-only": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy" |
| ], |
| "code-only": [ |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized" |
| ], |
| "science-only": [ |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics" |
| ], |
| "math+code": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized" |
| ], |
| "all": [ |
| "gsm8k", |
| "svamp", |
| "multiarith", |
| "aqua_rat", |
| "math_algebra_easy", |
| "math_counting_easy", |
| "mbpp", |
| "humaneval", |
| "mbpp_sanitized", |
| "sciq", |
| "arc_easy", |
| "openbookqa", |
| "medmcqa_easy", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology", |
| "mmlu_high_school_physics" |
| ] |
| }, |
| "heatmap": { |
| "math-only": { |
| "gsm_hard": -0.038461538461538554, |
| "gsm8k_test_500": 0.07812499999999999, |
| "mbpp_test_held": 0.0, |
| "mbpp_plus": -0.04285714285714289, |
| "arc_challenge": 2.0, |
| "openbookqa_test": 0.10975609756097571 |
| }, |
| "code-only": { |
| "gsm_hard": -0.15384615384615388, |
| "gsm8k_test_500": -0.03125000000000001, |
| "mbpp_test_held": 0.44444444444444453, |
| "mbpp_plus": 0.24285714285714274, |
| "arc_challenge": -1.5, |
| "openbookqa_test": 0.012195121951219795 |
| }, |
| "science-only": { |
| "gsm_hard": -0.038461538461538554, |
| "gsm8k_test_500": 0.171875, |
| "mbpp_test_held": 0.11111111111111091, |
| "mbpp_plus": -0.01428571428571426, |
| "arc_challenge": 3.5, |
| "openbookqa_test": 0.10975609756097571 |
| }, |
| "math+code": { |
| "gsm_hard": -0.15384615384615388, |
| "gsm8k_test_500": 0.09375000000000003, |
| "mbpp_test_held": 0.22222222222222213, |
| "mbpp_plus": 0.21428571428571425, |
| "arc_challenge": 0.5, |
| "openbookqa_test": 0.09756097560975632 |
| }, |
| "all": { |
| "gsm_hard": 0.038461538461538394, |
| "gsm8k_test_500": 0.06250000000000001, |
| "mbpp_test_held": 0.22222222222222213, |
| "mbpp_plus": 0.22857142857142862, |
| "arc_challenge": 3.0, |
| "openbookqa_test": 0.012195121951219795 |
| } |
| }, |
| "selected_top3": { |
| "math-only": { |
| "gsm_hard": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "gsm8k_test_500": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "mbpp_test_held": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "mbpp_plus": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "arc_challenge": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ], |
| "openbookqa_test": [ |
| "math_counting_easy", |
| "multiarith", |
| "math_algebra_easy" |
| ] |
| }, |
| "code-only": { |
| "gsm_hard": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "gsm8k_test_500": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "mbpp_test_held": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "mbpp_plus": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "arc_challenge": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "openbookqa_test": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ] |
| }, |
| "science-only": { |
| "gsm_hard": [ |
| "mmlu_high_school_physics", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ], |
| "gsm8k_test_500": [ |
| "mmlu_high_school_physics", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ], |
| "mbpp_test_held": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ], |
| "mbpp_plus": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ], |
| "arc_challenge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ], |
| "openbookqa_test": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ] |
| }, |
| "math+code": { |
| "gsm_hard": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "humaneval" |
| ], |
| "gsm8k_test_500": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "humaneval" |
| ], |
| "mbpp_test_held": [ |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "humaneval" |
| ], |
| "mbpp_plus": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "math_counting_easy" |
| ], |
| "arc_challenge": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "humaneval" |
| ], |
| "openbookqa_test": [ |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "humaneval" |
| ] |
| }, |
| "all": { |
| "gsm_hard": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics" |
| ], |
| "gsm8k_test_500": [ |
| "math_counting_easy", |
| "mbpp_sanitized", |
| "mmlu_high_school_physics" |
| ], |
| "mbpp_test_held": [ |
| "mbpp_sanitized", |
| "math_counting_easy", |
| "humaneval" |
| ], |
| "mbpp_plus": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "math_counting_easy" |
| ], |
| "arc_challenge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ], |
| "openbookqa_test": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mbpp_sanitized" |
| ] |
| } |
| }, |
| "best_pool_by_domain": { |
| "math": { |
| "best_anchor_pool": "science-only", |
| "score": 0.06670673076923073, |
| "top3_selections": { |
| "gsm_hard": [ |
| "mmlu_high_school_physics", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ], |
| "gsm8k_test_500": [ |
| "mmlu_high_school_physics", |
| "mmlu_elementary_math", |
| "mmlu_high_school_biology" |
| ] |
| } |
| }, |
| "code": { |
| "best_anchor_pool": "code-only", |
| "score": 0.34365079365079365, |
| "top3_selections": { |
| "mbpp_test_held": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ], |
| "mbpp_plus": [ |
| "mbpp_sanitized", |
| "humaneval", |
| "mbpp" |
| ] |
| } |
| }, |
| "science": { |
| "best_anchor_pool": "science-only", |
| "score": 1.8048780487804879, |
| "top3_selections": { |
| "arc_challenge": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ], |
| "openbookqa_test": [ |
| "mmlu_high_school_physics", |
| "mmlu_high_school_biology", |
| "mmlu_elementary_math" |
| ] |
| } |
| } |
| }, |
| "heldout_names": [ |
| "gsm_hard", |
| "gsm8k_test_500", |
| "mbpp_test_held", |
| "mbpp_plus", |
| "arc_challenge", |
| "openbookqa_test" |
| ] |
| } |
| } |