{ "results": { "hendrycksTest-miscellaneous": { "acc": 0.26181353767560667, "acc_stderr": 0.01572083867844526, "acc_norm": 0.24776500638569604, "acc_norm_stderr": 0.015438083080568965 }, "hendrycksTest-professional_accounting": { "acc": 0.2978723404255319, "acc_stderr": 0.027281608344469414, "acc_norm": 0.2695035460992908, "acc_norm_stderr": 0.02646903681859062 }, "hendrycksTest-moral_scenarios": { "acc": 0.2346368715083799, "acc_stderr": 0.014173044098303654, "acc_norm": 0.2569832402234637, "acc_norm_stderr": 0.014614465821966361 }, "sciq": { "acc": 0.633, "acc_stderr": 0.015249378464171749, "acc_norm": 0.552, "acc_norm_stderr": 0.01573351656634783 }, "hendrycksTest-nutrition": { "acc": 0.27450980392156865, "acc_stderr": 0.02555316999182651, "acc_norm": 0.3366013071895425, "acc_norm_stderr": 0.027057974624494382 }, "piqa": { "acc": 0.5973884657236126, "acc_stderr": 0.011442395233488698, "acc_norm": 0.5854189336235038, "acc_norm_stderr": 0.011494326682255158 }, "hendrycksTest-high_school_us_history": { "acc": 0.2549019607843137, "acc_stderr": 0.030587591351604243, "acc_norm": 0.27941176470588236, "acc_norm_stderr": 0.031493281045079556 }, "hendrycksTest-international_law": { "acc": 0.15702479338842976, "acc_stderr": 0.0332124484254713, "acc_norm": 0.4132231404958678, "acc_norm_stderr": 0.04495087843548408 }, "hendrycksTest-anatomy": { "acc": 0.2222222222222222, "acc_stderr": 0.035914440841969694, "acc_norm": 0.2740740740740741, "acc_norm_stderr": 0.03853254836552003 }, "crows_pairs_french_gender": { "likelihood_difference": 4.660533489096573, "likelihood_difference_stderr": 0.22532366484380598, "pct_stereotype": 0.5077881619937694, "pct_stereotype_stderr": 0.027947458769356347 }, "hendrycksTest-professional_medicine": { "acc": 0.2757352941176471, "acc_stderr": 0.027146271936625162, "acc_norm": 0.3125, "acc_norm_stderr": 0.02815637344037142 }, "hendrycksTest-high_school_psychology": { "acc": 0.27339449541284405, "acc_stderr": 0.0191092998460983, "acc_norm": 0.24403669724770644, "acc_norm_stderr": 0.018415286351416395 }, "hendrycksTest-astronomy": { "acc": 0.23684210526315788, "acc_stderr": 0.03459777606810535, "acc_norm": 0.3355263157894737, "acc_norm_stderr": 0.03842498559395268 }, "hendrycksTest-logical_fallacies": { "acc": 0.22699386503067484, "acc_stderr": 0.03291099578615769, "acc_norm": 0.26380368098159507, "acc_norm_stderr": 0.03462419931615623 }, "crows_pairs_french_disability": { "likelihood_difference": 6.6946022727272725, "likelihood_difference_stderr": 0.7491237826255029, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.06201736729460421 }, "hendrycksTest-high_school_chemistry": { "acc": 0.2315270935960591, "acc_stderr": 0.029678333141444455, "acc_norm": 0.3054187192118227, "acc_norm_stderr": 0.03240661565868408 }, "hendrycksTest-elementary_mathematics": { "acc": 0.25925925925925924, "acc_stderr": 0.022569897074918424, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.022569897074918424 }, "hendrycksTest-human_sexuality": { "acc": 0.3053435114503817, "acc_stderr": 0.040393149787245605, "acc_norm": 0.22137404580152673, "acc_norm_stderr": 0.03641297081313729 }, "hendrycksTest-professional_psychology": { "acc": 0.25, "acc_stderr": 0.01751781884501444, "acc_norm": 0.2630718954248366, "acc_norm_stderr": 0.017812676542320653 }, "crows_pairs_french_religion": { "likelihood_difference": 4.175815217391304, "likelihood_difference_stderr": 0.5425080644657401, "pct_stereotype": 0.5130434782608696, "pct_stereotype_stderr": 0.04681335351503156 }, "hendrycksTest-college_computer_science": { "acc": 0.28, "acc_stderr": 0.045126085985421255, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "crows_pairs_english_autre": { "likelihood_difference": 5.3977272727272725, "likelihood_difference_stderr": 1.790491828842816, "pct_stereotype": 0.6363636363636364, "pct_stereotype_stderr": 0.15212000482437738 }, "hendrycksTest-econometrics": { "acc": 0.30701754385964913, "acc_stderr": 0.04339138322579861, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.041424397194893624 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.23529411764705882, "acc_stderr": 0.027553614467863825, "acc_norm": 0.3403361344537815, "acc_norm_stderr": 0.030778057422931673 }, "hendrycksTest-moral_disputes": { "acc": 0.2774566473988439, "acc_stderr": 0.024105712607754307, "acc_norm": 0.2947976878612717, "acc_norm_stderr": 0.024547617794803835 }, "hendrycksTest-machine_learning": { "acc": 0.33035714285714285, "acc_stderr": 0.04464285714285713, "acc_norm": 0.20535714285714285, "acc_norm_stderr": 0.03834241021419073 }, "hendrycksTest-management": { "acc": 0.2621359223300971, "acc_stderr": 0.043546310772605935, "acc_norm": 0.27184466019417475, "acc_norm_stderr": 0.044052680241409216 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 5.223692602040816, "likelihood_difference_stderr": 0.3716517632652829, "pct_stereotype": 0.5510204081632653, "pct_stereotype_stderr": 0.03561884533975955 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2564102564102564, "acc_stderr": 0.022139081103971534, "acc_norm": 0.28205128205128205, "acc_norm_stderr": 0.022815813098896597 }, "hendrycksTest-security_studies": { "acc": 0.2897959183673469, "acc_stderr": 0.029043088683304345, "acc_norm": 0.2530612244897959, "acc_norm_stderr": 0.027833023871399683 }, "hendrycksTest-medical_genetics": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-high_school_statistics": { "acc": 0.2361111111111111, "acc_stderr": 0.028963702570791033, "acc_norm": 0.27314814814814814, "acc_norm_stderr": 0.03038805130167812 }, "crows_pairs_english": { "likelihood_difference": 3.675657796660704, "likelihood_difference_stderr": 0.10428478695252169, "pct_stereotype": 0.5438282647584973, "pct_stereotype_stderr": 0.012166287275376289 }, "hendrycksTest-high_school_physics": { "acc": 0.2185430463576159, "acc_stderr": 0.03374235550425694, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.03543304234389985 }, "crows_pairs_english_religion": { "likelihood_difference": 3.5057713963963963, "likelihood_difference_stderr": 0.4253117969664197, "pct_stereotype": 0.6216216216216216, "pct_stereotype_stderr": 0.04624128233851482 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.478158602150538, "likelihood_difference_stderr": 0.5463367427565824, "pct_stereotype": 0.7849462365591398, "pct_stereotype_stderr": 0.04283507835554755 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 4.158223684210526, "likelihood_difference_stderr": 0.2827099752616182, "pct_stereotype": 0.5842105263157895, "pct_stereotype_stderr": 0.0358501132552001 }, "crows_pairs_english_gender": { "likelihood_difference": 2.90234375, "likelihood_difference_stderr": 0.26743360486517015, "pct_stereotype": 0.5375, "pct_stereotype_stderr": 0.02791577963000663 }, "hendrycksTest-electrical_engineering": { "acc": 0.2413793103448276, "acc_stderr": 0.03565998174135303, "acc_norm": 0.27586206896551724, "acc_norm_stderr": 0.03724563619774632 }, "hendrycksTest-business_ethics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "hendrycksTest-global_facts": { "acc": 0.19, "acc_stderr": 0.03942772444036625, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "hendrycksTest-public_relations": { "acc": 0.2545454545454545, "acc_stderr": 0.04172343038705383, "acc_norm": 0.20909090909090908, "acc_norm_stderr": 0.03895091015724137 }, "crows_pairs_french_age": { "likelihood_difference": 4.967708333333333, "likelihood_difference_stderr": 0.4550873657608913, "pct_stereotype": 0.43333333333333335, "pct_stereotype_stderr": 0.05252667118728807 }, "hendrycksTest-virology": { "acc": 0.25301204819277107, "acc_stderr": 0.033844291552331346, "acc_norm": 0.21686746987951808, "acc_norm_stderr": 0.03208284450356365 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.224392361111111, "likelihood_difference_stderr": 0.5949955425776441, "pct_stereotype": 0.4861111111111111, "pct_stereotype_stderr": 0.059316185327165566 }, "hendrycksTest-human_aging": { "acc": 0.26905829596412556, "acc_stderr": 0.02976377940687497, "acc_norm": 0.21524663677130046, "acc_norm_stderr": 0.027584066602208274 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.25906735751295334, "acc_stderr": 0.03161877917935409, "acc_norm": 0.3005181347150259, "acc_norm_stderr": 0.033088185944157515 }, "hendrycksTest-philosophy": { "acc": 0.24437299035369775, "acc_stderr": 0.024406162094668893, "acc_norm": 0.26688102893890675, "acc_norm_stderr": 0.025122637608816646 }, "lambada_openai": { "ppl": 124.26962204175287, "ppl_stderr": 5.363117769801199, "acc": 0.22627595575392975, "acc_stderr": 0.005829406265404375 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.8758680555555554, "likelihood_difference_stderr": 0.41377726625457284, "pct_stereotype": 0.625, "pct_stereotype_stderr": 0.05745481997211521 }, "winogrande": { "acc": 0.5193370165745856, "acc_stderr": 0.014041972733712972 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.6435908564814814, "likelihood_difference_stderr": 0.26705840381438256, "pct_stereotype": 0.4305555555555556, "pct_stereotype_stderr": 0.03376922151252336 }, "hendrycksTest-college_physics": { "acc": 0.17647058823529413, "acc_stderr": 0.03793281185307809, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.7424950787401574, "likelihood_difference_stderr": 0.18169346622004526, "pct_stereotype": 0.5059055118110236, "pct_stereotype_stderr": 0.02220423067397246 }, "hendrycksTest-conceptual_physics": { "acc": 0.2936170212765957, "acc_stderr": 0.02977164271249123, "acc_norm": 0.1829787234042553, "acc_norm_stderr": 0.025276041000449966 }, "hendrycksTest-clinical_knowledge": { "acc": 0.23773584905660378, "acc_stderr": 0.026199808807561915, "acc_norm": 0.3018867924528302, "acc_norm_stderr": 0.028254200344438662 }, "hendrycksTest-college_mathematics": { "acc": 0.18, "acc_stderr": 0.03861229196653697, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036845 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.041633319989322716, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621503 }, "hendrycksTest-world_religions": { "acc": 0.23976608187134502, "acc_stderr": 0.03274485211946956, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.03565079670708311 }, "hendrycksTest-sociology": { "acc": 0.24378109452736318, "acc_stderr": 0.030360490154014638, "acc_norm": 0.2835820895522388, "acc_norm_stderr": 0.03187187537919798 }, "hendrycksTest-college_chemistry": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "hendrycksTest-high_school_world_history": { "acc": 0.2109704641350211, "acc_stderr": 0.02655837250266192, "acc_norm": 0.2742616033755274, "acc_norm_stderr": 0.029041333510598046 }, "logiqa": { "acc": 0.23195084485407066, "acc_stderr": 0.0165552524979259, "acc_norm": 0.27035330261136714, "acc_norm_stderr": 0.01742069478339314 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-high_school_computer_science": { "acc": 0.2, "acc_stderr": 0.04020151261036843, "acc_norm": 0.26, "acc_norm_stderr": 0.044084400227680814 }, "hendrycksTest-high_school_biology": { "acc": 0.2870967741935484, "acc_stderr": 0.025736542745594528, "acc_norm": 0.3, "acc_norm_stderr": 0.02606936229533513 }, "hendrycksTest-marketing": { "acc": 0.27350427350427353, "acc_stderr": 0.029202540153431177, "acc_norm": 0.2606837606837607, "acc_norm_stderr": 0.028760348956523414 }, "hendrycksTest-professional_law": { "acc": 0.24771838331160365, "acc_stderr": 0.011025499291443742, "acc_norm": 0.27444589308996087, "acc_norm_stderr": 0.011397043163078154 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.527667984189724, "likelihood_difference_stderr": 0.4209795564667756, "pct_stereotype": 0.308300395256917, "pct_stereotype_stderr": 0.02909012143059231 }, "hendrycksTest-prehistory": { "acc": 0.26851851851851855, "acc_stderr": 0.024659685185967284, "acc_norm": 0.21296296296296297, "acc_norm_stderr": 0.0227797190887334 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 7.548076923076923, "likelihood_difference_stderr": 0.5113727094452629, "pct_stereotype": 0.8131868131868132, "pct_stereotype_stderr": 0.04108446855035883 }, "hendrycksTest-high_school_mathematics": { "acc": 0.1814814814814815, "acc_stderr": 0.023499264669407292, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.025644108639267613 }, "hendrycksTest-jurisprudence": { "acc": 0.2037037037037037, "acc_stderr": 0.038935425188248475, "acc_norm": 0.3611111111111111, "acc_norm_stderr": 0.04643454608906275 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.847758152173913, "likelihood_difference_stderr": 0.2507391728199927, "pct_stereotype": 0.3239130434782609, "pct_stereotype_stderr": 0.021842842500532617 }, "hendrycksTest-us_foreign_policy": { "acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "arc_easy": { "acc": 0.37415824915824913, "acc_stderr": 0.009929516948977625, "acc_norm": 0.3367003367003367, "acc_norm_stderr": 0.009697166595752477 }, "arc_challenge": { "acc": 0.18600682593856654, "acc_stderr": 0.011370940183266749, "acc_norm": 0.22440273037542663, "acc_norm_stderr": 0.012191404938603843 }, "crows_pairs_english_age": { "likelihood_difference": 2.8133585164835164, "likelihood_difference_stderr": 0.27309263450343635, "pct_stereotype": 0.4725274725274725, "pct_stereotype_stderr": 0.05262501097748859 }, "crows_pairs_english_disability": { "likelihood_difference": 5.492307692307692, "likelihood_difference_stderr": 0.571991498636384, "pct_stereotype": 0.6461538461538462, "pct_stereotype_stderr": 0.05977027026123099 }, "crows_pairs_french": { "likelihood_difference": 5.452854800238521, "likelihood_difference_stderr": 0.13262546821335017, "pct_stereotype": 0.4442456768038163, "pct_stereotype_stderr": 0.012137130534698507 }, "hendrycksTest-formal_logic": { "acc": 0.2777777777777778, "acc_stderr": 0.040061680838488774, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.04073524322147125 }, "hendrycksTest-high_school_european_history": { "acc": 0.18181818181818182, "acc_stderr": 0.030117688929503585, "acc_norm": 0.2606060606060606, "acc_norm_stderr": 0.03427743175816524 }, "hendrycksTest-high_school_geography": { "acc": 0.2878787878787879, "acc_stderr": 0.03225883512300992, "acc_norm": 0.3181818181818182, "acc_norm_stderr": 0.03318477333845331 }, "hendrycksTest-college_medicine": { "acc": 0.24277456647398843, "acc_stderr": 0.0326926380614177, "acc_norm": 0.3063583815028902, "acc_norm_stderr": 0.03514942551267437 }, "crows_pairs_french_autre": { "likelihood_difference": 4.454326923076923, "likelihood_difference_stderr": 1.3817380041698064, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-college_biology": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.037455547914624576 } }, "versions": { "hendrycksTest-miscellaneous": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-moral_scenarios": 0, "sciq": 0, "hendrycksTest-nutrition": 0, "piqa": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-international_law": 0, "hendrycksTest-anatomy": 0, "crows_pairs_french_gender": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-logical_fallacies": 0, "crows_pairs_french_disability": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-professional_psychology": 0, "crows_pairs_french_religion": 0, "hendrycksTest-college_computer_science": 0, "crows_pairs_english_autre": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-management": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_english": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_english_religion": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_english_socioeconomic": 0, "crows_pairs_english_gender": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-public_relations": 0, "crows_pairs_french_age": 0, "hendrycksTest-virology": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-philosophy": 0, "lambada_openai": 0, "crows_pairs_english_physical_appearance": 0, "winogrande": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-college_physics": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-sociology": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-high_school_world_history": 0, "logiqa": 0, "wsc": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-marketing": 0, "hendrycksTest-professional_law": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-prehistory": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-jurisprudence": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-us_foreign_policy": 0, "arc_easy": 0, "arc_challenge": 0, "crows_pairs_english_age": 0, "crows_pairs_english_disability": 0, "crows_pairs_french": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_french_autre": 0, "hendrycksTest-college_biology": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step83000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }