{ "results": { "crows_pairs_french": { "likelihood_difference": 6.341835308586762, "likelihood_difference_stderr": 0.15048967426229426, "pct_stereotype": 0.49314251639833034, "pct_stereotype_stderr": 0.012212150501851282 }, "hendrycksTest-virology": { "acc": 0.2289156626506024, "acc_stderr": 0.03270745277352477, "acc_norm": 0.24096385542168675, "acc_norm_stderr": 0.03329394119073528 }, "hendrycksTest-econometrics": { "acc": 0.2631578947368421, "acc_stderr": 0.0414243971948936, "acc_norm": 0.2982456140350877, "acc_norm_stderr": 0.04303684033537315 }, "hendrycksTest-high_school_psychology": { "acc": 0.23669724770642203, "acc_stderr": 0.01822407811729908, "acc_norm": 0.25137614678899084, "acc_norm_stderr": 0.018599206360287415 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 4.267434210526316, "likelihood_difference_stderr": 0.33123322148424716, "pct_stereotype": 0.5210526315789473, "pct_stereotype_stderr": 0.03633739504773335 }, "hendrycksTest-security_studies": { "acc": 0.3224489795918367, "acc_stderr": 0.029923100563683913, "acc_norm": 0.2, "acc_norm_stderr": 0.025607375986579157 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2128205128205128, "acc_stderr": 0.020752423722128006, "acc_norm": 0.24615384615384617, "acc_norm_stderr": 0.021840866990423088 }, "crows_pairs_english_religion": { "likelihood_difference": 3.94481981981982, "likelihood_difference_stderr": 0.43861537266600115, "pct_stereotype": 0.5135135135135135, "pct_stereotype_stderr": 0.04765571461988585 }, "hendrycksTest-high_school_geography": { "acc": 0.23737373737373738, "acc_stderr": 0.030313710538198892, "acc_norm": 0.26262626262626265, "acc_norm_stderr": 0.03135305009533087 }, "crows_pairs_english": { "likelihood_difference": 4.10218209600477, "likelihood_difference_stderr": 0.1236077959409224, "pct_stereotype": 0.48598688133571855, "pct_stereotype_stderr": 0.012208501686447064 }, "logiqa": { "acc": 0.19201228878648233, "acc_stderr": 0.01544934998590095, "acc_norm": 0.22427035330261136, "acc_norm_stderr": 0.016360043348265515 }, "hendrycksTest-professional_medicine": { "acc": 0.22794117647058823, "acc_stderr": 0.025483081468029804, "acc_norm": 0.22426470588235295, "acc_norm_stderr": 0.025336848563332348 }, "hendrycksTest-moral_disputes": { "acc": 0.22254335260115607, "acc_stderr": 0.02239421566194282, "acc_norm": 0.17630057803468208, "acc_norm_stderr": 0.02051642567249071 }, "crows_pairs_french_age": { "likelihood_difference": 3.9694444444444446, "likelihood_difference_stderr": 0.42986839067167343, "pct_stereotype": 0.4444444444444444, "pct_stereotype_stderr": 0.052671718126664185 }, "crows_pairs_french_gender": { "likelihood_difference": 4.424454828660436, "likelihood_difference_stderr": 0.22752081870537358, "pct_stereotype": 0.5389408099688473, "pct_stereotype_stderr": 0.027865952192986033 }, "hendrycksTest-college_biology": { "acc": 0.2222222222222222, "acc_stderr": 0.034765901043041336, "acc_norm": 0.24305555555555555, "acc_norm_stderr": 0.03586879280080342 }, "hendrycksTest-high_school_physics": { "acc": 0.2185430463576159, "acc_stderr": 0.03374235550425694, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.035433042343899844 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 6.619357638888889, "likelihood_difference_stderr": 0.8210291220143636, "pct_stereotype": 0.5555555555555556, "pct_stereotype_stderr": 0.05897165471491952 }, "crows_pairs_french_disability": { "likelihood_difference": 8.846590909090908, "likelihood_difference_stderr": 0.9260560084979663, "pct_stereotype": 0.4090909090909091, "pct_stereotype_stderr": 0.060983672113630656 }, "hendrycksTest-global_facts": { "acc": 0.26, "acc_stderr": 0.04408440022768079, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 6.335817920918367, "likelihood_difference_stderr": 0.49605368420562174, "pct_stereotype": 0.3877551020408163, "pct_stereotype_stderr": 0.03489185364347385 }, "hendrycksTest-jurisprudence": { "acc": 0.18518518518518517, "acc_stderr": 0.03755265865037181, "acc_norm": 0.18518518518518517, "acc_norm_stderr": 0.03755265865037182 }, "hendrycksTest-machine_learning": { "acc": 0.24107142857142858, "acc_stderr": 0.04059867246952686, "acc_norm": 0.2767857142857143, "acc_norm_stderr": 0.042466243366976256 }, "hendrycksTest-high_school_mathematics": { "acc": 0.14074074074074075, "acc_stderr": 0.0212029303435688, "acc_norm": 0.2, "acc_norm_stderr": 0.024388430433987664 }, "hendrycksTest-human_sexuality": { "acc": 0.2900763358778626, "acc_stderr": 0.03980066246467765, "acc_norm": 0.31297709923664124, "acc_norm_stderr": 0.04066962905677698 }, "hendrycksTest-high_school_computer_science": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165044 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.18652849740932642, "acc_stderr": 0.02811209121011746, "acc_norm": 0.2538860103626943, "acc_norm_stderr": 0.03141024780565319 }, "hendrycksTest-professional_law": { "acc": 0.2379400260756193, "acc_stderr": 0.010875700787694231, "acc_norm": 0.26988265971316816, "acc_norm_stderr": 0.011337381084250423 }, "hendrycksTest-clinical_knowledge": { "acc": 0.2, "acc_stderr": 0.024618298195866518, "acc_norm": 0.2943396226415094, "acc_norm_stderr": 0.028049186315695245 }, "hendrycksTest-high_school_chemistry": { "acc": 0.20689655172413793, "acc_stderr": 0.02850137816789395, "acc_norm": 0.2512315270935961, "acc_norm_stderr": 0.030516530732694436 }, "hendrycksTest-management": { "acc": 0.22330097087378642, "acc_stderr": 0.04123553189891431, "acc_norm": 0.24271844660194175, "acc_norm_stderr": 0.04245022486384495 }, "hendrycksTest-astronomy": { "acc": 0.21710526315789475, "acc_stderr": 0.03355045304882924, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.03690677986137282 }, "hendrycksTest-high_school_european_history": { "acc": 0.1393939393939394, "acc_stderr": 0.0270459488258654, "acc_norm": 0.23636363636363636, "acc_norm_stderr": 0.03317505930009182 }, "hendrycksTest-college_medicine": { "acc": 0.24277456647398843, "acc_stderr": 0.0326926380614177, "acc_norm": 0.2254335260115607, "acc_norm_stderr": 0.03186209851641143 }, "hendrycksTest-college_physics": { "acc": 0.16666666666666666, "acc_stderr": 0.03708284662416542, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.042801058373643966 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.21008403361344538, "acc_stderr": 0.026461398717471874, "acc_norm": 0.27310924369747897, "acc_norm_stderr": 0.02894200404099817 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.7877604166666665, "likelihood_difference_stderr": 0.4734800383795231, "pct_stereotype": 0.5138888888888888, "pct_stereotype_stderr": 0.05931618532716555 }, "hendrycksTest-abstract_algebra": { "acc": 0.19, "acc_stderr": 0.03942772444036625, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036843 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2275132275132275, "acc_stderr": 0.021591269407823774, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.021411684393694185 }, "hendrycksTest-human_aging": { "acc": 0.3452914798206278, "acc_stderr": 0.03191100192835794, "acc_norm": 0.2825112107623318, "acc_norm_stderr": 0.030216831011508773 }, "winogrande": { "acc": 0.5098658247829518, "acc_stderr": 0.014049749833367592 }, "crows_pairs_english_disability": { "likelihood_difference": 6.442788461538462, "likelihood_difference_stderr": 0.7741982131043712, "pct_stereotype": 0.5230769230769231, "pct_stereotype_stderr": 0.06243339646441512 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 14.371565934065934, "likelihood_difference_stderr": 1.0132795779676502, "pct_stereotype": 0.8021978021978022, "pct_stereotype_stderr": 0.04198895203196222 }, "hendrycksTest-high_school_biology": { "acc": 0.22258064516129034, "acc_stderr": 0.023664216671642514, "acc_norm": 0.24516129032258063, "acc_norm_stderr": 0.024472243840895525 }, "arc_challenge": { "acc": 0.18003412969283278, "acc_stderr": 0.011227856729050028, "acc_norm": 0.2175767918088737, "acc_norm_stderr": 0.012057262020972499 }, "hendrycksTest-us_foreign_policy": { "acc": 0.2, "acc_stderr": 0.04020151261036845, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "hendrycksTest-college_computer_science": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036624 }, "lambada_openai": { "ppl": 116756.33428953367, "ppl_stderr": 6456.789280142739, "acc": 0.0, "acc_stderr": 0.0 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.444220430107527, "likelihood_difference_stderr": 0.6472437756111237, "pct_stereotype": 0.7096774193548387, "pct_stereotype_stderr": 0.04732351421824121 }, "hendrycksTest-high_school_statistics": { "acc": 0.25462962962962965, "acc_stderr": 0.02971127586000536, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.029157522184605596 }, "hendrycksTest-computer_security": { "acc": 0.27, "acc_stderr": 0.04461960433384739, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.9554347826086955, "likelihood_difference_stderr": 0.22275405195298537, "pct_stereotype": 0.5260869565217391, "pct_stereotype_stderr": 0.0233062153668594 }, "crows_pairs_english_age": { "likelihood_difference": 3.65625, "likelihood_difference_stderr": 0.4765308636339587, "pct_stereotype": 0.38461538461538464, "pct_stereotype_stderr": 0.051282051282051246 }, "hendrycksTest-philosophy": { "acc": 0.22186495176848875, "acc_stderr": 0.02359885829286305, "acc_norm": 0.2797427652733119, "acc_norm_stderr": 0.0254942593506949 }, "hendrycksTest-electrical_engineering": { "acc": 0.2827586206896552, "acc_stderr": 0.037528339580033376, "acc_norm": 0.2689655172413793, "acc_norm_stderr": 0.036951833116502325 }, "hendrycksTest-sociology": { "acc": 0.21393034825870647, "acc_stderr": 0.028996909693328927, "acc_norm": 0.21393034825870647, "acc_norm_stderr": 0.02899690969332891 }, "hendrycksTest-professional_accounting": { "acc": 0.2695035460992908, "acc_stderr": 0.026469036818590624, "acc_norm": 0.2730496453900709, "acc_norm_stderr": 0.02657786094330786 }, "hendrycksTest-miscellaneous": { "acc": 0.24393358876117496, "acc_stderr": 0.01535721266582948, "acc_norm": 0.25287356321839083, "acc_norm_stderr": 0.015543377313719681 }, "sciq": { "acc": 0.264, "acc_stderr": 0.01394627184944047, "acc_norm": 0.275, "acc_norm_stderr": 0.014127086556490528 }, "crows_pairs_french_autre": { "likelihood_difference": 5.8173076923076925, "likelihood_difference_stderr": 0.8524880376227814, "pct_stereotype": 0.3076923076923077, "pct_stereotype_stderr": 0.13323467750529824 }, "hendrycksTest-public_relations": { "acc": 0.3181818181818182, "acc_stderr": 0.044612721759105085, "acc_norm": 0.15454545454545454, "acc_norm_stderr": 0.03462262571262667 }, "crows_pairs_french_religion": { "likelihood_difference": 7.920380434782609, "likelihood_difference_stderr": 0.5131357048721925, "pct_stereotype": 0.5652173913043478, "pct_stereotype_stderr": 0.046429222863564275 }, "hendrycksTest-international_law": { "acc": 0.12396694214876033, "acc_stderr": 0.030083098716035227, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.04065578140908705 }, "hendrycksTest-anatomy": { "acc": 0.22962962962962963, "acc_stderr": 0.036333844140734664, "acc_norm": 0.2518518518518518, "acc_norm_stderr": 0.03749850709174023 }, "hendrycksTest-conceptual_physics": { "acc": 0.2765957446808511, "acc_stderr": 0.029241883869628827, "acc_norm": 0.20425531914893616, "acc_norm_stderr": 0.026355158413349424 }, "wsc": { "acc": 0.5192307692307693, "acc_stderr": 0.049230010729780505 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-medical_genetics": { "acc": 0.23, "acc_stderr": 0.042295258468165044, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932269 }, "piqa": { "acc": 0.5386289445048966, "acc_stderr": 0.011630956681145914, "acc_norm": 0.5244831338411317, "acc_norm_stderr": 0.011651830225709979 }, "hendrycksTest-college_mathematics": { "acc": 0.16, "acc_stderr": 0.03684529491774709, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036625 }, "hendrycksTest-prehistory": { "acc": 0.27469135802469136, "acc_stderr": 0.024836057868294688, "acc_norm": 0.2191358024691358, "acc_norm_stderr": 0.023016705640262196 }, "hendrycksTest-professional_psychology": { "acc": 0.2369281045751634, "acc_stderr": 0.017201662169789782, "acc_norm": 0.2973856209150327, "acc_norm_stderr": 0.01849259653639695 }, "hendrycksTest-logical_fallacies": { "acc": 0.2147239263803681, "acc_stderr": 0.03226219377286774, "acc_norm": 0.3128834355828221, "acc_norm_stderr": 0.036429145782924055 }, "crows_pairs_english_nationality": { "likelihood_difference": 4.462456597222222, "likelihood_difference_stderr": 0.33400887699163057, "pct_stereotype": 0.33796296296296297, "pct_stereotype_stderr": 0.03225941352631295 }, "hendrycksTest-formal_logic": { "acc": 0.23015873015873015, "acc_stderr": 0.03764950879790607, "acc_norm": 0.25396825396825395, "acc_norm_stderr": 0.038932596106046755 }, "crows_pairs_english_gender": { "likelihood_difference": 3.560498046875, "likelihood_difference_stderr": 0.32984136074752396, "pct_stereotype": 0.5375, "pct_stereotype_stderr": 0.02791577963000664 }, "hendrycksTest-high_school_world_history": { "acc": 0.189873417721519, "acc_stderr": 0.025530100460233497, "acc_norm": 0.23628691983122363, "acc_norm_stderr": 0.02765215314415925 }, "hendrycksTest-college_chemistry": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "hendrycksTest-business_ethics": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "hendrycksTest-world_religions": { "acc": 0.14035087719298245, "acc_stderr": 0.0266405825391332, "acc_norm": 0.21052631578947367, "acc_norm_stderr": 0.03126781714663179 }, "crows_pairs_english_autre": { "likelihood_difference": 6.849431818181818, "likelihood_difference_stderr": 2.586994276246196, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.15212000482437738 }, "hendrycksTest-high_school_us_history": { "acc": 0.20588235294117646, "acc_stderr": 0.028379449451588667, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.02977177522814565 }, "hendrycksTest-nutrition": { "acc": 0.17973856209150327, "acc_stderr": 0.021986032182064148, "acc_norm": 0.27450980392156865, "acc_norm_stderr": 0.025553169991826517 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.832756916996048, "likelihood_difference_stderr": 0.35785767445511346, "pct_stereotype": 0.3438735177865613, "pct_stereotype_stderr": 0.029922155720849428 }, "arc_easy": { "acc": 0.2984006734006734, "acc_stderr": 0.009388855914040428, "acc_norm": 0.30134680134680136, "acc_norm_stderr": 0.0094152598793516 }, "hendrycksTest-marketing": { "acc": 0.24358974358974358, "acc_stderr": 0.0281209665039144, "acc_norm": 0.25213675213675213, "acc_norm_stderr": 0.02844796547623101 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.9656434547244093, "likelihood_difference_stderr": 0.19638996276808376, "pct_stereotype": 0.468503937007874, "pct_stereotype_stderr": 0.022161679438492773 } }, "versions": { "crows_pairs_french": 0, "hendrycksTest-virology": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-high_school_psychology": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-high_school_macroeconomics": 0, "crows_pairs_english_religion": 0, "hendrycksTest-high_school_geography": 0, "crows_pairs_english": 0, "logiqa": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-moral_disputes": 0, "crows_pairs_french_age": 0, "crows_pairs_french_gender": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_french_physical_appearance": 0, "crows_pairs_french_disability": 0, "hendrycksTest-global_facts": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-management": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-high_school_microeconomics": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-human_aging": 0, "winogrande": 0, "crows_pairs_english_disability": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_biology": 0, "arc_challenge": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-college_computer_science": 0, "lambada_openai": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-computer_security": 0, "crows_pairs_french_race_color": 0, "crows_pairs_english_age": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-sociology": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-miscellaneous": 0, "sciq": 0, "crows_pairs_french_autre": 0, "hendrycksTest-public_relations": 0, "crows_pairs_french_religion": 0, "hendrycksTest-international_law": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-conceptual_physics": 0, "wsc": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-medical_genetics": 0, "piqa": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-logical_fallacies": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-formal_logic": 0, "crows_pairs_english_gender": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_autre": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-nutrition": 0, "crows_pairs_french_nationality": 0, "arc_easy": 0, "hendrycksTest-marketing": 0, "crows_pairs_english_race_color": 0 }, "config": { "model": "hf-causal", "model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step512", "num_fewshot": 0, "batch_size": 32, "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }