{ "results": { "hendrycksTest-high_school_chemistry": { "acc": 0.19704433497536947, "acc_stderr": 0.02798672466673622, "acc_norm": 0.26108374384236455, "acc_norm_stderr": 0.03090379695211447 }, "crows_pairs_english_disability": { "likelihood_difference": 5.444230769230769, "likelihood_difference_stderr": 0.5923616154197596, "pct_stereotype": 0.6153846153846154, "pct_stereotype_stderr": 0.06081303192631497 }, "hendrycksTest-marketing": { "acc": 0.2564102564102564, "acc_stderr": 0.028605953702004253, "acc_norm": 0.2863247863247863, "acc_norm_stderr": 0.02961432369045665 }, "hendrycksTest-moral_disputes": { "acc": 0.24855491329479767, "acc_stderr": 0.023267528432100174, "acc_norm": 0.31213872832369943, "acc_norm_stderr": 0.02494679222527231 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.701822916666667, "likelihood_difference_stderr": 0.6164179150786165, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.05933908290969268 }, "hendrycksTest-astronomy": { "acc": 0.2565789473684211, "acc_stderr": 0.035541803680256896, "acc_norm": 0.3815789473684211, "acc_norm_stderr": 0.03953173377749194 }, "hendrycksTest-professional_law": { "acc": 0.2607561929595828, "acc_stderr": 0.011213471559602336, "acc_norm": 0.2777053455019557, "acc_norm_stderr": 0.01143874142276956 }, "hendrycksTest-high_school_world_history": { "acc": 0.2320675105485232, "acc_stderr": 0.02747974455080852, "acc_norm": 0.2616033755274262, "acc_norm_stderr": 0.028609516716994934 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2566137566137566, "acc_stderr": 0.022494510767503154, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.022860838309232072 }, "crows_pairs_french_autre": { "likelihood_difference": 5.016826923076923, "likelihood_difference_stderr": 1.136143920954686, "pct_stereotype": 0.46153846153846156, "pct_stereotype_stderr": 0.14390989949130548 }, "hendrycksTest-high_school_computer_science": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.31, "acc_norm_stderr": 0.046482319871173156 }, "hendrycksTest-logical_fallacies": { "acc": 0.1901840490797546, "acc_stderr": 0.03083349114628123, "acc_norm": 0.2822085889570552, "acc_norm_stderr": 0.03536117886664743 }, "hendrycksTest-prehistory": { "acc": 0.2962962962962963, "acc_stderr": 0.025407197798890165, "acc_norm": 0.22839506172839505, "acc_norm_stderr": 0.023358211840626267 }, "hendrycksTest-machine_learning": { "acc": 0.24107142857142858, "acc_stderr": 0.040598672469526864, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.03894641120044792 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-econometrics": { "acc": 0.3157894736842105, "acc_stderr": 0.04372748290278007, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.041857744240220554 }, "hendrycksTest-high_school_statistics": { "acc": 0.24537037037037038, "acc_stderr": 0.029346665094372955, "acc_norm": 0.2638888888888889, "acc_norm_stderr": 0.03005820270430985 }, "crows_pairs_french_religion": { "likelihood_difference": 4.592934782608696, "likelihood_difference_stderr": 0.5258696424725507, "pct_stereotype": 0.591304347826087, "pct_stereotype_stderr": 0.04604188749503789 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 6.3279532967032965, "likelihood_difference_stderr": 0.4838492961401693, "pct_stereotype": 0.7912087912087912, "pct_stereotype_stderr": 0.04284305206509431 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 5.059749681122449, "likelihood_difference_stderr": 0.4036977090086114, "pct_stereotype": 0.5459183673469388, "pct_stereotype_stderr": 0.035654431417332814 }, "hendrycksTest-virology": { "acc": 0.28313253012048195, "acc_stderr": 0.03507295431370519, "acc_norm": 0.22289156626506024, "acc_norm_stderr": 0.03240004825594688 }, "arc_easy": { "acc": 0.359006734006734, "acc_stderr": 0.009843424713072176, "acc_norm": 0.3514309764309764, "acc_norm_stderr": 0.00979639558281772 }, "logiqa": { "acc": 0.2196620583717358, "acc_stderr": 0.01623910941493396, "acc_norm": 0.2642089093701997, "acc_norm_stderr": 0.017293954549744518 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542126 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.742103494623656, "likelihood_difference_stderr": 0.5813902425880072, "pct_stereotype": 0.7526881720430108, "pct_stereotype_stderr": 0.0449817218566707 }, "crows_pairs_french_age": { "likelihood_difference": 5.097569444444445, "likelihood_difference_stderr": 0.5161568677503977, "pct_stereotype": 0.4888888888888889, "pct_stereotype_stderr": 0.05298680599073449 }, "hendrycksTest-computer_security": { "acc": 0.16, "acc_stderr": 0.036845294917747094, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-high_school_us_history": { "acc": 0.22549019607843138, "acc_stderr": 0.029331162294251714, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.030587591351604246 }, "hendrycksTest-high_school_european_history": { "acc": 0.21212121212121213, "acc_stderr": 0.03192271569548299, "acc_norm": 0.3090909090909091, "acc_norm_stderr": 0.036085410115739666 }, "crows_pairs_french_gender": { "likelihood_difference": 4.377141744548287, "likelihood_difference_stderr": 0.2263841012514179, "pct_stereotype": 0.5077881619937694, "pct_stereotype_stderr": 0.027947458769356347 }, "hendrycksTest-high_school_mathematics": { "acc": 0.16296296296296298, "acc_stderr": 0.02251856199768264, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.02578787422095933 }, "crows_pairs_english_religion": { "likelihood_difference": 3.8074324324324325, "likelihood_difference_stderr": 0.44590802870655577, "pct_stereotype": 0.6126126126126126, "pct_stereotype_stderr": 0.0464482507235508 }, "hendrycksTest-global_facts": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "hendrycksTest-clinical_knowledge": { "acc": 0.2528301886792453, "acc_stderr": 0.026749899771241235, "acc_norm": 0.30943396226415093, "acc_norm_stderr": 0.028450154794118627 }, "hendrycksTest-anatomy": { "acc": 0.2222222222222222, "acc_stderr": 0.035914440841969694, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.035914440841969694 }, "hendrycksTest-business_ethics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "arc_challenge": { "acc": 0.17235494880546076, "acc_stderr": 0.011037113093461295, "acc_norm": 0.22013651877133106, "acc_norm_stderr": 0.01210812488346098 }, "hendrycksTest-college_computer_science": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "hendrycksTest-world_religions": { "acc": 0.2046783625730994, "acc_stderr": 0.030944459778533207, "acc_norm": 0.2573099415204678, "acc_norm_stderr": 0.03352799844161865 }, "hendrycksTest-human_aging": { "acc": 0.29596412556053814, "acc_stderr": 0.030636591348699796, "acc_norm": 0.26905829596412556, "acc_norm_stderr": 0.029763779406874972 }, "hendrycksTest-philosophy": { "acc": 0.21543408360128619, "acc_stderr": 0.023350225475471418, "acc_norm": 0.2572347266881029, "acc_norm_stderr": 0.024826171289250888 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.23834196891191708, "acc_stderr": 0.030748905363909892, "acc_norm": 0.27979274611398963, "acc_norm_stderr": 0.032396370467357036 }, "hendrycksTest-miscellaneous": { "acc": 0.27330779054916987, "acc_stderr": 0.01593668106262856, "acc_norm": 0.25287356321839083, "acc_norm_stderr": 0.015543377313719681 }, "hendrycksTest-security_studies": { "acc": 0.2897959183673469, "acc_stderr": 0.029043088683304342, "acc_norm": 0.2530612244897959, "acc_norm_stderr": 0.02783302387139968 }, "hendrycksTest-management": { "acc": 0.18446601941747573, "acc_stderr": 0.03840423627288276, "acc_norm": 0.2815533980582524, "acc_norm_stderr": 0.04453254836326468 }, "crows_pairs_english_gender": { "likelihood_difference": 3.0462890625, "likelihood_difference_stderr": 0.27914349499619723, "pct_stereotype": 0.4875, "pct_stereotype_stderr": 0.02798587585995666 }, "crows_pairs_english_autre": { "likelihood_difference": 5.657670454545454, "likelihood_difference_stderr": 1.600279703203965, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.1574591643244434 }, "hendrycksTest-college_biology": { "acc": 0.2777777777777778, "acc_stderr": 0.037455547914624576, "acc_norm": 0.3125, "acc_norm_stderr": 0.038760854559127644 }, "hendrycksTest-medical_genetics": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "hendrycksTest-high_school_psychology": { "acc": 0.26605504587155965, "acc_stderr": 0.018946022322225604, "acc_norm": 0.26055045871559634, "acc_norm_stderr": 0.01881918203485007 }, "hendrycksTest-nutrition": { "acc": 0.22549019607843138, "acc_stderr": 0.023929155517351277, "acc_norm": 0.34967320261437906, "acc_norm_stderr": 0.027305308076274702 }, "hendrycksTest-jurisprudence": { "acc": 0.23148148148148148, "acc_stderr": 0.04077494709252626, "acc_norm": 0.3425925925925926, "acc_norm_stderr": 0.04587904741301811 }, "crows_pairs_french_nationality": { "likelihood_difference": 8.008646245059289, "likelihood_difference_stderr": 0.4304168992412896, "pct_stereotype": 0.2924901185770751, "pct_stereotype_stderr": 0.028656396908494263 }, "hendrycksTest-human_sexuality": { "acc": 0.2900763358778626, "acc_stderr": 0.03980066246467766, "acc_norm": 0.22900763358778625, "acc_norm_stderr": 0.036853466317118506 }, "hendrycksTest-high_school_biology": { "acc": 0.27741935483870966, "acc_stderr": 0.025470196835900055, "acc_norm": 0.29354838709677417, "acc_norm_stderr": 0.02590608702131929 }, "winogrande": { "acc": 0.5082872928176796, "acc_stderr": 0.014050555322824194 }, "crows_pairs_english_age": { "likelihood_difference": 2.9198145604395602, "likelihood_difference_stderr": 0.29304502937485644, "pct_stereotype": 0.42857142857142855, "pct_stereotype_stderr": 0.05216405309573015 }, "hendrycksTest-international_law": { "acc": 0.15702479338842976, "acc_stderr": 0.0332124484254713, "acc_norm": 0.38016528925619836, "acc_norm_stderr": 0.04431324501968431 }, "piqa": { "acc": 0.5979325353645266, "acc_stderr": 0.011439867127267533, "acc_norm": 0.5843307943416758, "acc_norm_stderr": 0.011498699770894792 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 4.0234375, "likelihood_difference_stderr": 0.43062333624877625, "pct_stereotype": 0.6388888888888888, "pct_stereotype_stderr": 0.0570038146170086 }, "hendrycksTest-professional_medicine": { "acc": 0.2536764705882353, "acc_stderr": 0.02643132987078952, "acc_norm": 0.26838235294117646, "acc_norm_stderr": 0.0269174812243772 }, "sciq": { "acc": 0.642, "acc_stderr": 0.01516792886540756, "acc_norm": 0.554, "acc_norm_stderr": 0.015726771166750357 }, "crows_pairs_french_disability": { "likelihood_difference": 6.504734848484849, "likelihood_difference_stderr": 0.682563745515591, "pct_stereotype": 0.4696969696969697, "pct_stereotype_stderr": 0.06190336468479955 }, "hendrycksTest-college_medicine": { "acc": 0.23121387283236994, "acc_stderr": 0.032147373020294696, "acc_norm": 0.2947976878612717, "acc_norm_stderr": 0.034765996075164785 }, "crows_pairs_english": { "likelihood_difference": 3.6300033542039354, "likelihood_difference_stderr": 0.10616730589284588, "pct_stereotype": 0.5020870602265951, "pct_stereotype_stderr": 0.012213192820312024 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.774342105263158, "likelihood_difference_stderr": 0.25861312687838434, "pct_stereotype": 0.6526315789473685, "pct_stereotype_stderr": 0.03463365347393427 }, "hendrycksTest-professional_psychology": { "acc": 0.2581699346405229, "acc_stderr": 0.017704531653250068, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.017401816711427657 }, "hendrycksTest-high_school_physics": { "acc": 0.23178807947019867, "acc_stderr": 0.034454062719870525, "acc_norm": 0.2582781456953642, "acc_norm_stderr": 0.035737053147634576 }, "hendrycksTest-conceptual_physics": { "acc": 0.28936170212765955, "acc_stderr": 0.02964400657700962, "acc_norm": 0.22127659574468084, "acc_norm_stderr": 0.02713634960242406 }, "hendrycksTest-high_school_geography": { "acc": 0.24242424242424243, "acc_stderr": 0.030532892233932036, "acc_norm": 0.25252525252525254, "acc_norm_stderr": 0.030954055470365904 }, "hendrycksTest-college_physics": { "acc": 0.13725490196078433, "acc_stderr": 0.034240846698915216, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.04023382273617747 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.933559782608696, "likelihood_difference_stderr": 0.23732252395314296, "pct_stereotype": 0.26956521739130435, "pct_stereotype_stderr": 0.02071172670289539 }, "hendrycksTest-public_relations": { "acc": 0.2636363636363636, "acc_stderr": 0.04220224692971987, "acc_norm": 0.17272727272727273, "acc_norm_stderr": 0.03620691833929219 }, "hendrycksTest-sociology": { "acc": 0.27860696517412936, "acc_stderr": 0.031700561834973086, "acc_norm": 0.2835820895522388, "acc_norm_stderr": 0.031871875379197966 }, "hendrycksTest-formal_logic": { "acc": 0.2857142857142857, "acc_stderr": 0.04040610178208841, "acc_norm": 0.2619047619047619, "acc_norm_stderr": 0.03932537680392871 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.2773109243697479, "acc_stderr": 0.029079374539480007, "acc_norm": 0.3487394957983193, "acc_norm_stderr": 0.030956636328566545 }, "hendrycksTest-electrical_engineering": { "acc": 0.27586206896551724, "acc_stderr": 0.037245636197746325, "acc_norm": 0.30344827586206896, "acc_norm_stderr": 0.038312260488503336 }, "hendrycksTest-college_mathematics": { "acc": 0.2, "acc_stderr": 0.040201512610368445, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-college_chemistry": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-us_foreign_policy": { "acc": 0.26, "acc_stderr": 0.04408440022768077, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.545275590551181, "likelihood_difference_stderr": 0.17976008372744548, "pct_stereotype": 0.4094488188976378, "pct_stereotype_stderr": 0.021838590402568178 }, "hendrycksTest-professional_accounting": { "acc": 0.2730496453900709, "acc_stderr": 0.02657786094330786, "acc_norm": 0.24822695035460993, "acc_norm_stderr": 0.02577001564429039 }, "hendrycksTest-moral_scenarios": { "acc": 0.24804469273743016, "acc_stderr": 0.014444157808261427, "acc_norm": 0.2737430167597765, "acc_norm_stderr": 0.014912413096372432 }, "lambada_openai": { "ppl": 140.52328755411287, "ppl_stderr": 6.002931469828659, "acc": 0.21405006792159906, "acc_stderr": 0.00571435475116112 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2282051282051282, "acc_stderr": 0.021278393863586282, "acc_norm": 0.28205128205128205, "acc_norm_stderr": 0.0228158130988966 }, "crows_pairs_french": { "likelihood_difference": 5.462297816040548, "likelihood_difference_stderr": 0.1326279922858767, "pct_stereotype": 0.43231961836613, "pct_stereotype_stderr": 0.012100892636108567 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.5159143518518516, "likelihood_difference_stderr": 0.2836003217082339, "pct_stereotype": 0.39351851851851855, "pct_stereotype_stderr": 0.03331747876370312 } }, "versions": { "hendrycksTest-high_school_chemistry": 0, "crows_pairs_english_disability": 0, "hendrycksTest-marketing": 0, "hendrycksTest-moral_disputes": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-elementary_mathematics": 0, "crows_pairs_french_autre": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-machine_learning": 0, "wsc": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_french_religion": 0, "crows_pairs_french_sexual_orientation": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-virology": 0, "arc_easy": 0, "logiqa": 0, "hendrycksTest-abstract_algebra": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_french_age": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_european_history": 0, "crows_pairs_french_gender": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_english_religion": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-business_ethics": 0, "arc_challenge": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-management": 0, "crows_pairs_english_gender": 0, "crows_pairs_english_autre": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-jurisprudence": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-high_school_biology": 0, "winogrande": 0, "crows_pairs_english_age": 0, "hendrycksTest-international_law": 0, "piqa": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-professional_medicine": 0, "sciq": 0, "crows_pairs_french_disability": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_english": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-college_physics": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-sociology": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-us_foreign_policy": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-moral_scenarios": 0, "lambada_openai": 0, "hendrycksTest-high_school_macroeconomics": 0, "crows_pairs_french": 0, "crows_pairs_english_nationality": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step93000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:1", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }