{ "results": { "crows_pairs_english_gender": { "likelihood_difference": 2.968603515625, "likelihood_difference_stderr": 0.27632562920815934, "pct_stereotype": 0.55625, "pct_stereotype_stderr": 0.02781690795790493 }, "hendrycksTest-college_chemistry": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "crows_pairs_french_gender": { "likelihood_difference": 4.533975856697819, "likelihood_difference_stderr": 0.23505554828445724, "pct_stereotype": 0.5295950155763239, "pct_stereotype_stderr": 0.027901844420051187 }, "hendrycksTest-machine_learning": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467763, "acc_norm": 0.26785714285714285, "acc_norm_stderr": 0.04203277291467762 }, "hendrycksTest-conceptual_physics": { "acc": 0.25957446808510637, "acc_stderr": 0.02865917937429232, "acc_norm": 0.20425531914893616, "acc_norm_stderr": 0.02635515841334942 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.651982060185185, "likelihood_difference_stderr": 0.2683321501875231, "pct_stereotype": 0.4074074074074074, "pct_stereotype_stderr": 0.03350991604696042 }, "hendrycksTest-us_foreign_policy": { "acc": 0.28, "acc_stderr": 0.04512608598542126, "acc_norm": 0.34, "acc_norm_stderr": 0.047609522856952365 }, "hendrycksTest-college_computer_science": { "acc": 0.26, "acc_stderr": 0.04408440022768079, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "logiqa": { "acc": 0.20890937019969277, "acc_stderr": 0.015945399396423927, "acc_norm": 0.2764976958525346, "acc_norm_stderr": 0.01754320907582518 }, "hendrycksTest-human_aging": { "acc": 0.2600896860986547, "acc_stderr": 0.029442495585857483, "acc_norm": 0.21524663677130046, "acc_norm_stderr": 0.02758406660220826 }, "hendrycksTest-virology": { "acc": 0.28313253012048195, "acc_stderr": 0.03507295431370519, "acc_norm": 0.25903614457831325, "acc_norm_stderr": 0.03410646614071856 }, "crows_pairs_english_disability": { "likelihood_difference": 5.518028846153846, "likelihood_difference_stderr": 0.6386232385225414, "pct_stereotype": 0.6615384615384615, "pct_stereotype_stderr": 0.059148294227806535 }, "hendrycksTest-security_studies": { "acc": 0.34285714285714286, "acc_stderr": 0.030387262919547724, "acc_norm": 0.2530612244897959, "acc_norm_stderr": 0.02783302387139968 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.539185531496063, "likelihood_difference_stderr": 0.16924052532261086, "pct_stereotype": 0.5118110236220472, "pct_stereotype_stderr": 0.022199583294816916 }, "hendrycksTest-college_medicine": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.28901734104046245, "acc_norm_stderr": 0.034564257450870016 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.23834196891191708, "acc_stderr": 0.030748905363909902, "acc_norm": 0.2849740932642487, "acc_norm_stderr": 0.03257714077709661 }, "crows_pairs_french": { "likelihood_difference": 5.358885845259392, "likelihood_difference_stderr": 0.13730895956322042, "pct_stereotype": 0.43828264758497315, "pct_stereotype_stderr": 0.012119900409052399 }, "hendrycksTest-formal_logic": { "acc": 0.29365079365079366, "acc_stderr": 0.040735243221471276, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.040406101782088394 }, "hendrycksTest-logical_fallacies": { "acc": 0.2331288343558282, "acc_stderr": 0.0332201579577674, "acc_norm": 0.3128834355828221, "acc_norm_stderr": 0.03642914578292405 }, "hendrycksTest-world_religions": { "acc": 0.21052631578947367, "acc_stderr": 0.031267817146631786, "acc_norm": 0.26900584795321636, "acc_norm_stderr": 0.0340105262010409 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.822203947368421, "likelihood_difference_stderr": 0.25490868312334425, "pct_stereotype": 0.6157894736842106, "pct_stereotype_stderr": 0.03538097998767891 }, "hendrycksTest-high_school_world_history": { "acc": 0.270042194092827, "acc_stderr": 0.028900721906293426, "acc_norm": 0.25738396624472576, "acc_norm_stderr": 0.028458820991460285 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-moral_scenarios": { "acc": 0.25921787709497207, "acc_stderr": 0.014655780837497731, "acc_norm": 0.24692737430167597, "acc_norm_stderr": 0.014422292204808836 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2037037037037037, "acc_stderr": 0.02455617221914125, "acc_norm": 0.26296296296296295, "acc_norm_stderr": 0.026842057873833706 }, "crows_pairs_english": { "likelihood_difference": 3.589249776386404, "likelihood_difference_stderr": 0.10257069027145715, "pct_stereotype": 0.5497912939773405, "pct_stereotype_stderr": 0.012152590574174898 }, "crows_pairs_french_autre": { "likelihood_difference": 4.9423076923076925, "likelihood_difference_stderr": 1.2264143363162354, "pct_stereotype": 0.46153846153846156, "pct_stereotype_stderr": 0.14390989949130548 }, "hendrycksTest-business_ethics": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-professional_medicine": { "acc": 0.2426470588235294, "acc_stderr": 0.026040662474201275, "acc_norm": 0.26838235294117646, "acc_norm_stderr": 0.02691748122437722 }, "piqa": { "acc": 0.5984766050054406, "acc_stderr": 0.011437324373397846, "acc_norm": 0.5865070729053319, "acc_norm_stderr": 0.011489895831821136 }, "hendrycksTest-moral_disputes": { "acc": 0.26011560693641617, "acc_stderr": 0.023618678310069363, "acc_norm": 0.3092485549132948, "acc_norm_stderr": 0.024883140570071755 }, "hendrycksTest-professional_accounting": { "acc": 0.23404255319148937, "acc_stderr": 0.025257861359432428, "acc_norm": 0.25886524822695034, "acc_norm_stderr": 0.026129572527180848 }, "winogrande": { "acc": 0.5232833464877664, "acc_stderr": 0.014037241309573642 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.29411764705882354, "acc_stderr": 0.02959732973097809, "acc_norm": 0.3697478991596639, "acc_norm_stderr": 0.031357095996135904 }, "hendrycksTest-prehistory": { "acc": 0.32098765432098764, "acc_stderr": 0.025976566010862737, "acc_norm": 0.22530864197530864, "acc_norm_stderr": 0.02324620264781975 }, "arc_easy": { "acc": 0.3707912457912458, "acc_stderr": 0.009911292822056923, "acc_norm": 0.34553872053872053, "acc_norm_stderr": 0.009757948730670301 }, "hendrycksTest-clinical_knowledge": { "acc": 0.25660377358490566, "acc_stderr": 0.026880647889051968, "acc_norm": 0.3320754716981132, "acc_norm_stderr": 0.02898545565233439 }, "hendrycksTest-high_school_statistics": { "acc": 0.2222222222222222, "acc_stderr": 0.02835321286686343, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.028963702570791044 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.0625, "likelihood_difference_stderr": 0.4286061255414444, "pct_stereotype": 0.7362637362637363, "pct_stereotype_stderr": 0.04644942852497396 }, "hendrycksTest-medical_genetics": { "acc": 0.2, "acc_stderr": 0.04020151261036846, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "hendrycksTest-elementary_mathematics": { "acc": 0.24074074074074073, "acc_stderr": 0.0220190800122179, "acc_norm": 0.25132275132275134, "acc_norm_stderr": 0.022340482339643895 }, "crows_pairs_english_age": { "likelihood_difference": 2.770776098901099, "likelihood_difference_stderr": 0.2827474692485144, "pct_stereotype": 0.5494505494505495, "pct_stereotype_stderr": 0.05244623100101224 }, "hendrycksTest-international_law": { "acc": 0.18181818181818182, "acc_stderr": 0.03520893951097655, "acc_norm": 0.4380165289256198, "acc_norm_stderr": 0.045291468044357915 }, "lambada_openai": { "ppl": 148.4586759416483, "ppl_stderr": 6.263736488398032, "acc": 0.19347952648942363, "acc_stderr": 0.005503478560447365 }, "arc_challenge": { "acc": 0.17491467576791808, "acc_stderr": 0.01110156250182823, "acc_norm": 0.22013651877133106, "acc_norm_stderr": 0.012108124883460988 }, "crows_pairs_french_age": { "likelihood_difference": 4.508333333333334, "likelihood_difference_stderr": 0.4073546275703716, "pct_stereotype": 0.36666666666666664, "pct_stereotype_stderr": 0.05108070528032164 }, "hendrycksTest-high_school_us_history": { "acc": 0.2696078431372549, "acc_stderr": 0.031145570659486782, "acc_norm": 0.27450980392156865, "acc_norm_stderr": 0.031321798030832904 }, "hendrycksTest-college_mathematics": { "acc": 0.19, "acc_stderr": 0.039427724440366234, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078 }, "hendrycksTest-electrical_engineering": { "acc": 0.30344827586206896, "acc_stderr": 0.038312260488503336, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.037528339580033376 }, "crows_pairs_english_autre": { "likelihood_difference": 4.394886363636363, "likelihood_difference_stderr": 1.3966545911585055, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "hendrycksTest-professional_law": { "acc": 0.24315514993481094, "acc_stderr": 0.010956556654417346, "acc_norm": 0.273142112125163, "acc_norm_stderr": 0.011380150567830396 }, "hendrycksTest-jurisprudence": { "acc": 0.2037037037037037, "acc_stderr": 0.03893542518824847, "acc_norm": 0.3425925925925926, "acc_norm_stderr": 0.04587904741301811 }, "hendrycksTest-human_sexuality": { "acc": 0.31297709923664124, "acc_stderr": 0.04066962905677698, "acc_norm": 0.2366412213740458, "acc_norm_stderr": 0.037276735755969195 }, "hendrycksTest-public_relations": { "acc": 0.2545454545454545, "acc_stderr": 0.04172343038705383, "acc_norm": 0.18181818181818182, "acc_norm_stderr": 0.03694284335337798 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.710665760869565, "likelihood_difference_stderr": 0.2548812285925958, "pct_stereotype": 0.35434782608695653, "pct_stereotype_stderr": 0.02232584228256916 }, "hendrycksTest-high_school_biology": { "acc": 0.2806451612903226, "acc_stderr": 0.02556060472102288, "acc_norm": 0.3096774193548387, "acc_norm_stderr": 0.026302774983517418 }, "hendrycksTest-high_school_psychology": { "acc": 0.26422018348623855, "acc_stderr": 0.0189041641715102, "acc_norm": 0.25688073394495414, "acc_norm_stderr": 0.018732492928342465 }, "hendrycksTest-high_school_european_history": { "acc": 0.2545454545454545, "acc_stderr": 0.03401506715249039, "acc_norm": 0.32727272727272727, "acc_norm_stderr": 0.03663974994391242 }, "hendrycksTest-high_school_geography": { "acc": 0.26262626262626265, "acc_stderr": 0.031353050095330855, "acc_norm": 0.29292929292929293, "acc_norm_stderr": 0.03242497958178815 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.517809139784946, "likelihood_difference_stderr": 0.5509527681471865, "pct_stereotype": 0.7096774193548387, "pct_stereotype_stderr": 0.04732351421824122 }, "hendrycksTest-philosophy": { "acc": 0.21221864951768488, "acc_stderr": 0.02322275679743512, "acc_norm": 0.24758842443729903, "acc_norm_stderr": 0.024513879973621967 }, "hendrycksTest-high_school_chemistry": { "acc": 0.21182266009852216, "acc_stderr": 0.02874898368994106, "acc_norm": 0.28078817733990147, "acc_norm_stderr": 0.03161856335358609 }, "hendrycksTest-college_biology": { "acc": 0.25, "acc_stderr": 0.03621034121889507, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.03716177437566017 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 5.1596380739795915, "likelihood_difference_stderr": 0.43468629725261915, "pct_stereotype": 0.46938775510204084, "pct_stereotype_stderr": 0.035738572888608724 }, "crows_pairs_english_religion": { "likelihood_difference": 3.549268018018018, "likelihood_difference_stderr": 0.4167156533862711, "pct_stereotype": 0.6396396396396397, "pct_stereotype_stderr": 0.04577621167070314 }, "crows_pairs_french_disability": { "likelihood_difference": 6.535037878787879, "likelihood_difference_stderr": 0.7494925060241215, "pct_stereotype": 0.4696969696969697, "pct_stereotype_stderr": 0.06190336468479955 }, "hendrycksTest-marketing": { "acc": 0.2905982905982906, "acc_stderr": 0.029745048572674054, "acc_norm": 0.3034188034188034, "acc_norm_stderr": 0.030118210106942645 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.26666666666666666, "acc_stderr": 0.022421273612923714, "acc_norm": 0.30256410256410254, "acc_norm_stderr": 0.02329088805377273 }, "hendrycksTest-professional_psychology": { "acc": 0.2434640522875817, "acc_stderr": 0.017362473762146637, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.01740181671142766 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.9303385416666665, "likelihood_difference_stderr": 0.4204840146607544, "pct_stereotype": 0.6111111111111112, "pct_stereotype_stderr": 0.057855371034784615 }, "hendrycksTest-econometrics": { "acc": 0.2631578947368421, "acc_stderr": 0.041424397194893624, "acc_norm": 0.2543859649122807, "acc_norm_stderr": 0.040969851398436716 }, "crows_pairs_french_religion": { "likelihood_difference": 5.23695652173913, "likelihood_difference_stderr": 0.5548918938463742, "pct_stereotype": 0.5391304347826087, "pct_stereotype_stderr": 0.04668566114758418 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.778532608695652, "likelihood_difference_stderr": 0.44524534158910173, "pct_stereotype": 0.2924901185770751, "pct_stereotype_stderr": 0.028656396908494267 }, "hendrycksTest-anatomy": { "acc": 0.2222222222222222, "acc_stderr": 0.035914440841969694, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.03633384414073463 }, "sciq": { "acc": 0.617, "acc_stderr": 0.01538010232565271, "acc_norm": 0.541, "acc_norm_stderr": 0.015766025737882165 }, "hendrycksTest-high_school_physics": { "acc": 0.18543046357615894, "acc_stderr": 0.03173284384294287, "acc_norm": 0.2185430463576159, "acc_norm_stderr": 0.03374235550425694 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "hendrycksTest-global_facts": { "acc": 0.2, "acc_stderr": 0.040201512610368466, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322695 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.84765625, "likelihood_difference_stderr": 0.6465954210577143, "pct_stereotype": 0.5138888888888888, "pct_stereotype_stderr": 0.05931618532716554 }, "hendrycksTest-high_school_computer_science": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-nutrition": { "acc": 0.2777777777777778, "acc_stderr": 0.025646863097137908, "acc_norm": 0.35947712418300654, "acc_norm_stderr": 0.027475969910660952 }, "hendrycksTest-astronomy": { "acc": 0.20394736842105263, "acc_stderr": 0.0327900040631005, "acc_norm": 0.3618421052631579, "acc_norm_stderr": 0.03910525752849724 }, "hendrycksTest-college_physics": { "acc": 0.21568627450980393, "acc_stderr": 0.040925639582376536, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.04280105837364395 }, "hendrycksTest-sociology": { "acc": 0.29850746268656714, "acc_stderr": 0.03235743789355043, "acc_norm": 0.27860696517412936, "acc_norm_stderr": 0.031700561834973086 }, "hendrycksTest-management": { "acc": 0.23300970873786409, "acc_stderr": 0.04185832598928312, "acc_norm": 0.3106796116504854, "acc_norm_stderr": 0.0458212416016155 }, "hendrycksTest-miscellaneous": { "acc": 0.24265644955300128, "acc_stderr": 0.015329888940899858, "acc_norm": 0.23754789272030652, "acc_norm_stderr": 0.015218733046150195 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816507, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 } }, "versions": { "crows_pairs_english_gender": 0, "hendrycksTest-college_chemistry": 0, "crows_pairs_french_gender": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-conceptual_physics": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-college_computer_science": 0, "logiqa": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-virology": 0, "crows_pairs_english_disability": 0, "hendrycksTest-security_studies": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-high_school_government_and_politics": 0, "crows_pairs_french": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-high_school_world_history": 0, "wsc": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_english": 0, "crows_pairs_french_autre": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-professional_medicine": 0, "piqa": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-professional_accounting": 0, "winogrande": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-prehistory": 0, "arc_easy": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-elementary_mathematics": 0, "crows_pairs_english_age": 0, "hendrycksTest-international_law": 0, "lambada_openai": 0, "arc_challenge": 0, "crows_pairs_french_age": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-electrical_engineering": 0, "crows_pairs_english_autre": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-public_relations": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_geography": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-college_biology": 0, "crows_pairs_french_socioeconomic": 0, "crows_pairs_english_religion": 0, "crows_pairs_french_disability": 0, "hendrycksTest-marketing": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-professional_psychology": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-econometrics": 0, "crows_pairs_french_religion": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-anatomy": 0, "sciq": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-global_facts": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-sociology": 0, "hendrycksTest-management": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-abstract_algebra": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step133000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:5", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }