{ "results": { "hendrycksTest-logical_fallacies": { "acc": 0.20245398773006135, "acc_stderr": 0.031570650789119005, "acc_norm": 0.3312883435582822, "acc_norm_stderr": 0.03697983910025588 }, "hendrycksTest-high_school_computer_science": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "crows_pairs_english_autre": { "likelihood_difference": 5.623579545454546, "likelihood_difference_stderr": 2.0995916372617076, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.1574591643244434 }, "hendrycksTest-anatomy": { "acc": 0.17777777777777778, "acc_stderr": 0.03302789859901717, "acc_norm": 0.1925925925925926, "acc_norm_stderr": 0.0340654205850265 }, "crows_pairs_french_disability": { "likelihood_difference": 6.588541666666667, "likelihood_difference_stderr": 0.6318561818523015, "pct_stereotype": 0.3939393939393939, "pct_stereotype_stderr": 0.06060606060606063 }, "lambada_openai": { "ppl": 121.29880202709046, "ppl_stderr": 5.126206628890121, "acc": 0.2225887832330681, "acc_stderr": 0.005795476001421499 }, "hendrycksTest-prehistory": { "acc": 0.27469135802469136, "acc_stderr": 0.024836057868294677, "acc_norm": 0.20679012345679013, "acc_norm_stderr": 0.02253500670594282 }, "hendrycksTest-college_computer_science": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "hendrycksTest-conceptual_physics": { "acc": 0.23404255319148937, "acc_stderr": 0.027678452578212404, "acc_norm": 0.20425531914893616, "acc_norm_stderr": 0.026355158413349424 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.7319878472222223, "likelihood_difference_stderr": 0.367552757457845, "pct_stereotype": 0.5833333333333334, "pct_stereotype_stderr": 0.05850912479161746 }, "hendrycksTest-virology": { "acc": 0.25301204819277107, "acc_stderr": 0.033844291552331346, "acc_norm": 0.2289156626506024, "acc_norm_stderr": 0.03270745277352477 }, "hendrycksTest-high_school_physics": { "acc": 0.2251655629139073, "acc_stderr": 0.03410435282008937, "acc_norm": 0.2251655629139073, "acc_norm_stderr": 0.03410435282008937 }, "piqa": { "acc": 0.6077257889009793, "acc_stderr": 0.01139184674407223, "acc_norm": 0.5968443960826986, "acc_norm_stderr": 0.011444908701768742 }, "hendrycksTest-college_physics": { "acc": 0.18627450980392157, "acc_stderr": 0.03873958714149352, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171452 }, "hendrycksTest-jurisprudence": { "acc": 0.25925925925925924, "acc_stderr": 0.04236511258094632, "acc_norm": 0.37962962962962965, "acc_norm_stderr": 0.04691521224077742 }, "hendrycksTest-management": { "acc": 0.23300970873786409, "acc_stderr": 0.04185832598928313, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.04354631077260597 }, "hendrycksTest-machine_learning": { "acc": 0.30357142857142855, "acc_stderr": 0.04364226155841044, "acc_norm": 0.1875, "acc_norm_stderr": 0.0370468111477387 }, "hendrycksTest-public_relations": { "acc": 0.3181818181818182, "acc_stderr": 0.04461272175910508, "acc_norm": 0.17272727272727273, "acc_norm_stderr": 0.03620691833929219 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.5847077546296298, "likelihood_difference_stderr": 0.27013774973583443, "pct_stereotype": 0.4212962962962963, "pct_stereotype_stderr": 0.03367462138896078 }, "crows_pairs_french": { "likelihood_difference": 5.100849731663685, "likelihood_difference_stderr": 0.1250618981891961, "pct_stereotype": 0.4561717352415027, "pct_stereotype_stderr": 0.012166287275376289 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.24352331606217617, "acc_stderr": 0.030975436386845436, "acc_norm": 0.24352331606217617, "acc_norm_stderr": 0.03097543638684543 }, "hendrycksTest-sociology": { "acc": 0.3034825870646766, "acc_stderr": 0.03251006816458619, "acc_norm": 0.2885572139303483, "acc_norm_stderr": 0.03203841040213321 }, "hendrycksTest-international_law": { "acc": 0.19008264462809918, "acc_stderr": 0.03581796951709282, "acc_norm": 0.4297520661157025, "acc_norm_stderr": 0.04519082021319774 }, "hendrycksTest-high_school_mathematics": { "acc": 0.17037037037037037, "acc_stderr": 0.022922554863074956, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558 }, "crows_pairs_english_age": { "likelihood_difference": 2.7034684065934065, "likelihood_difference_stderr": 0.24752294883796136, "pct_stereotype": 0.5054945054945055, "pct_stereotype_stderr": 0.05270144531112881 }, "hendrycksTest-moral_scenarios": { "acc": 0.23687150837988827, "acc_stderr": 0.01421957078810399, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-marketing": { "acc": 0.2606837606837607, "acc_stderr": 0.02876034895652341, "acc_norm": 0.27350427350427353, "acc_norm_stderr": 0.02920254015343117 }, "hendrycksTest-high_school_statistics": { "acc": 0.1712962962962963, "acc_stderr": 0.025695341643824688, "acc_norm": 0.25462962962962965, "acc_norm_stderr": 0.029711275860005344 }, "logiqa": { "acc": 0.20583717357910905, "acc_stderr": 0.01585842321932388, "acc_norm": 0.30568356374807987, "acc_norm_stderr": 0.018069997343763473 }, "hendrycksTest-high_school_us_history": { "acc": 0.22058823529411764, "acc_stderr": 0.029102254389674093, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.03019028245350194 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.896134868421053, "likelihood_difference_stderr": 0.2740385405281198, "pct_stereotype": 0.631578947368421, "pct_stereotype_stderr": 0.03508771929824559 }, "hendrycksTest-high_school_european_history": { "acc": 0.24242424242424243, "acc_stderr": 0.03346409881055953, "acc_norm": 0.296969696969697, "acc_norm_stderr": 0.03567969772268048 }, "hendrycksTest-high_school_biology": { "acc": 0.23225806451612904, "acc_stderr": 0.024022256130308235, "acc_norm": 0.3, "acc_norm_stderr": 0.026069362295335134 }, "hendrycksTest-world_religions": { "acc": 0.26900584795321636, "acc_stderr": 0.03401052620104089, "acc_norm": 0.3216374269005848, "acc_norm_stderr": 0.03582529442573122 }, "hendrycksTest-college_medicine": { "acc": 0.24855491329479767, "acc_stderr": 0.03295304696818317, "acc_norm": 0.31213872832369943, "acc_norm_stderr": 0.03533133389323657 }, "hendrycksTest-high_school_geography": { "acc": 0.26262626262626265, "acc_stderr": 0.031353050095330855, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.03173071239071724 }, "hendrycksTest-abstract_algebra": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206824 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.371294466403162, "likelihood_difference_stderr": 0.39282681114851664, "pct_stereotype": 0.2845849802371542, "pct_stereotype_stderr": 0.02842397052208522 }, "hendrycksTest-philosophy": { "acc": 0.19935691318327975, "acc_stderr": 0.022691033780549656, "acc_norm": 0.2604501607717042, "acc_norm_stderr": 0.02492672322484555 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.2132472826086955, "likelihood_difference_stderr": 0.2256584973219656, "pct_stereotype": 0.3673913043478261, "pct_stereotype_stderr": 0.022502235852959178 }, "hendrycksTest-computer_security": { "acc": 0.19, "acc_stderr": 0.03942772444036623, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-human_sexuality": { "acc": 0.2900763358778626, "acc_stderr": 0.03980066246467765, "acc_norm": 0.2595419847328244, "acc_norm_stderr": 0.03844876139785271 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.20588235294117646, "acc_stderr": 0.026265024608275882, "acc_norm": 0.31932773109243695, "acc_norm_stderr": 0.030283995525884396 }, "hendrycksTest-human_aging": { "acc": 0.25112107623318386, "acc_stderr": 0.029105220833224598, "acc_norm": 0.23318385650224216, "acc_norm_stderr": 0.028380391147094713 }, "hendrycksTest-astronomy": { "acc": 0.19736842105263158, "acc_stderr": 0.03238981601699397, "acc_norm": 0.3026315789473684, "acc_norm_stderr": 0.037385206761196686 }, "sciq": { "acc": 0.653, "acc_stderr": 0.015060472031706615, "acc_norm": 0.561, "acc_norm_stderr": 0.015701131345400778 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.24102564102564103, "acc_stderr": 0.02168554666533318, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.022421273612923717 }, "hendrycksTest-professional_medicine": { "acc": 0.29044117647058826, "acc_stderr": 0.027576468622740522, "acc_norm": 0.27205882352941174, "acc_norm_stderr": 0.027033041151681456 }, "hendrycksTest-electrical_engineering": { "acc": 0.296551724137931, "acc_stderr": 0.03806142687309994, "acc_norm": 0.31724137931034485, "acc_norm_stderr": 0.03878352372138622 }, "hendrycksTest-professional_accounting": { "acc": 0.2624113475177305, "acc_stderr": 0.026244920349842993, "acc_norm": 0.23404255319148937, "acc_norm_stderr": 0.025257861359432414 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.4817913385826773, "likelihood_difference_stderr": 0.1785277431576976, "pct_stereotype": 0.4822834645669291, "pct_stereotype_stderr": 0.022191835500120254 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.0161401098901095, "likelihood_difference_stderr": 0.46342554021316945, "pct_stereotype": 0.7912087912087912, "pct_stereotype_stderr": 0.04284305206509431 }, "hendrycksTest-professional_psychology": { "acc": 0.23202614379084968, "acc_stderr": 0.017077373377856996, "acc_norm": 0.2696078431372549, "acc_norm_stderr": 0.017952449196987862 }, "hendrycksTest-college_mathematics": { "acc": 0.17, "acc_stderr": 0.03775251680686371, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "hendrycksTest-high_school_psychology": { "acc": 0.25871559633027524, "acc_stderr": 0.01877605231961962, "acc_norm": 0.25137614678899084, "acc_norm_stderr": 0.01859920636028741 }, "crows_pairs_english_gender": { "likelihood_difference": 3.19052734375, "likelihood_difference_stderr": 0.27369024853254786, "pct_stereotype": 0.565625, "pct_stereotype_stderr": 0.02775245248136476 }, "crows_pairs_french_age": { "likelihood_difference": 5.053125, "likelihood_difference_stderr": 0.49175013822402935, "pct_stereotype": 0.4444444444444444, "pct_stereotype_stderr": 0.05267171812666418 }, "crows_pairs_french_autre": { "likelihood_difference": 3.076923076923077, "likelihood_difference_stderr": 0.5135827293235896, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-business_ethics": { "acc": 0.35, "acc_stderr": 0.04793724854411019, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.696572580645161, "likelihood_difference_stderr": 0.5990114491774148, "pct_stereotype": 0.7634408602150538, "pct_stereotype_stderr": 0.04430611317732682 }, "hendrycksTest-nutrition": { "acc": 0.27450980392156865, "acc_stderr": 0.025553169991826514, "acc_norm": 0.35947712418300654, "acc_norm_stderr": 0.027475969910660952 }, "hendrycksTest-miscellaneous": { "acc": 0.24393358876117496, "acc_stderr": 0.015357212665829496, "acc_norm": 0.2413793103448276, "acc_norm_stderr": 0.015302380123542094 }, "crows_pairs_english": { "likelihood_difference": 3.5903957960644006, "likelihood_difference_stderr": 0.10456293216351101, "pct_stereotype": 0.5426356589147286, "pct_stereotype_stderr": 0.01216881555248585 }, "hendrycksTest-high_school_chemistry": { "acc": 0.17733990147783252, "acc_stderr": 0.02687433727680835, "acc_norm": 0.26108374384236455, "acc_norm_stderr": 0.03090379695211447 }, "hendrycksTest-formal_logic": { "acc": 0.2857142857142857, "acc_stderr": 0.04040610178208841, "acc_norm": 0.24603174603174602, "acc_norm_stderr": 0.03852273364924315 }, "hendrycksTest-us_foreign_policy": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.474330357142857, "likelihood_difference_stderr": 0.36811002483302646, "pct_stereotype": 0.4642857142857143, "pct_stereotype_stderr": 0.035714285714285705 }, "crows_pairs_french_religion": { "likelihood_difference": 4.437771739130435, "likelihood_difference_stderr": 0.5024669037031032, "pct_stereotype": 0.6, "pct_stereotype_stderr": 0.04588314677411234 }, "hendrycksTest-college_biology": { "acc": 0.2916666666666667, "acc_stderr": 0.038009680605548574, "acc_norm": 0.24305555555555555, "acc_norm_stderr": 0.0358687928008034 }, "hendrycksTest-econometrics": { "acc": 0.22807017543859648, "acc_stderr": 0.03947152782669415, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489362 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.053819444444445, "likelihood_difference_stderr": 0.6134387786113794, "pct_stereotype": 0.5555555555555556, "pct_stereotype_stderr": 0.05897165471491952 }, "hendrycksTest-moral_disputes": { "acc": 0.26011560693641617, "acc_stderr": 0.023618678310069363, "acc_norm": 0.2832369942196532, "acc_norm_stderr": 0.024257901705323374 }, "crows_pairs_french_gender": { "likelihood_difference": 5.027453271028038, "likelihood_difference_stderr": 0.2316347109876809, "pct_stereotype": 0.557632398753894, "pct_stereotype_stderr": 0.02776455173721248 }, "hendrycksTest-high_school_world_history": { "acc": 0.24050632911392406, "acc_stderr": 0.027820781981149675, "acc_norm": 0.3037974683544304, "acc_norm_stderr": 0.02993669638713861 }, "arc_challenge": { "acc": 0.18600682593856654, "acc_stderr": 0.011370940183266759, "acc_norm": 0.20477815699658702, "acc_norm_stderr": 0.011792544338513405 }, "winogrande": { "acc": 0.48539857932123126, "acc_stderr": 0.014046492383275846 }, "crows_pairs_english_religion": { "likelihood_difference": 3.318130630630631, "likelihood_difference_stderr": 0.4059063464506704, "pct_stereotype": 0.6036036036036037, "pct_stereotype_stderr": 0.04663848326322448 }, "hendrycksTest-college_chemistry": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, "hendrycksTest-global_facts": { "acc": 0.22, "acc_stderr": 0.041633319989322674, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "hendrycksTest-elementary_mathematics": { "acc": 0.24338624338624337, "acc_stderr": 0.022101128787415436, "acc_norm": 0.2619047619047619, "acc_norm_stderr": 0.022644212615525218 }, "hendrycksTest-security_studies": { "acc": 0.3020408163265306, "acc_stderr": 0.029393609319879818, "acc_norm": 0.23673469387755103, "acc_norm_stderr": 0.027212835884073163 }, "hendrycksTest-professional_law": { "acc": 0.26140808344198174, "acc_stderr": 0.011222528169771312, "acc_norm": 0.2790091264667536, "acc_norm_stderr": 0.011455208832803534 }, "crows_pairs_english_disability": { "likelihood_difference": 5.156009615384615, "likelihood_difference_stderr": 0.6114325636078373, "pct_stereotype": 0.6307692307692307, "pct_stereotype_stderr": 0.060324565928300454 }, "arc_easy": { "acc": 0.3952020202020202, "acc_stderr": 0.01003189405279098, "acc_norm": 0.3602693602693603, "acc_norm_stderr": 0.009851002584732387 }, "hendrycksTest-medical_genetics": { "acc": 0.21, "acc_stderr": 0.04093601807403326, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "hendrycksTest-clinical_knowledge": { "acc": 0.24150943396226415, "acc_stderr": 0.026341480371118362, "acc_norm": 0.3132075471698113, "acc_norm_stderr": 0.02854479331905533 } }, "versions": { "hendrycksTest-logical_fallacies": 0, "hendrycksTest-high_school_computer_science": 0, "crows_pairs_english_autre": 0, "hendrycksTest-anatomy": 0, "crows_pairs_french_disability": 0, "lambada_openai": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-conceptual_physics": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-virology": 0, "hendrycksTest-high_school_physics": 0, "piqa": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-management": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-public_relations": 0, "wsc": 0, "crows_pairs_english_nationality": 0, "crows_pairs_french": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-sociology": 0, "hendrycksTest-international_law": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_english_age": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-marketing": 0, "hendrycksTest-high_school_statistics": 0, "logiqa": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-abstract_algebra": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-philosophy": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-astronomy": 0, "sciq": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-professional_accounting": 0, "crows_pairs_english_race_color": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-high_school_psychology": 0, "crows_pairs_english_gender": 0, "crows_pairs_french_age": 0, "crows_pairs_french_autre": 0, "hendrycksTest-business_ethics": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-miscellaneous": 0, "crows_pairs_english": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-us_foreign_policy": 0, "crows_pairs_french_socioeconomic": 0, "crows_pairs_french_religion": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-econometrics": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-moral_disputes": 0, "crows_pairs_french_gender": 0, "hendrycksTest-high_school_world_history": 0, "arc_challenge": 0, "winogrande": 0, "crows_pairs_english_religion": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-professional_law": 0, "crows_pairs_english_disability": 0, "arc_easy": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-clinical_knowledge": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step23000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:2", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }