{ "results": { "hendrycksTest-high_school_microeconomics": { "acc": 0.24789915966386555, "acc_stderr": 0.028047967224176896, "acc_norm": 0.3697478991596639, "acc_norm_stderr": 0.031357095996135904 }, "hendrycksTest-moral_disputes": { "acc": 0.25722543352601157, "acc_stderr": 0.02353292543104428, "acc_norm": 0.2861271676300578, "acc_norm_stderr": 0.02433214677913413 }, "hendrycksTest-elementary_mathematics": { "acc": 0.23544973544973544, "acc_stderr": 0.02185150982203172, "acc_norm": 0.23544973544973544, "acc_norm_stderr": 0.021851509822031715 }, "hendrycksTest-security_studies": { "acc": 0.3469387755102041, "acc_stderr": 0.0304725260267265, "acc_norm": 0.23673469387755103, "acc_norm_stderr": 0.02721283588407315 }, "hendrycksTest-philosophy": { "acc": 0.18971061093247588, "acc_stderr": 0.022268196258783218, "acc_norm": 0.2829581993569132, "acc_norm_stderr": 0.025583062489984838 }, "hendrycksTest-college_mathematics": { "acc": 0.16, "acc_stderr": 0.03684529491774709, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "crows_pairs_english_autre": { "likelihood_difference": 4.8039772727272725, "likelihood_difference_stderr": 1.5170731185765034, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.1574591643244434 }, "hendrycksTest-human_aging": { "acc": 0.3004484304932735, "acc_stderr": 0.03076935200822914, "acc_norm": 0.21524663677130046, "acc_norm_stderr": 0.027584066602208263 }, "lambada_openai": { "ppl": 101.68439461161867, "ppl_stderr": 4.289464289805073, "acc": 0.23287405394915583, "acc_stderr": 0.00588851737109305 }, "crows_pairs_english": { "likelihood_difference": 3.5585494931425163, "likelihood_difference_stderr": 0.10448474706694104, "pct_stereotype": 0.545020870602266, "pct_stereotype_stderr": 0.012163688705232118 }, "hendrycksTest-college_chemistry": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.626591435185185, "likelihood_difference_stderr": 0.25256110110560454, "pct_stereotype": 0.4305555555555556, "pct_stereotype_stderr": 0.03376922151252336 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816508, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "crows_pairs_french_gender": { "likelihood_difference": 4.325545171339564, "likelihood_difference_stderr": 0.21316376809344068, "pct_stereotype": 0.5233644859813084, "pct_stereotype_stderr": 0.02792031634820498 }, "hendrycksTest-professional_medicine": { "acc": 0.2647058823529412, "acc_stderr": 0.026799562024887674, "acc_norm": 0.25, "acc_norm_stderr": 0.026303648393696036 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.393002717391305, "likelihood_difference_stderr": 0.22538878372864266, "pct_stereotype": 0.40652173913043477, "pct_stereotype_stderr": 0.022926510173270086 }, "piqa": { "acc": 0.5930359085963003, "acc_stderr": 0.011462093919190166, "acc_norm": 0.5990206746463548, "acc_norm_stderr": 0.011434766962108316 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.674395161290323, "likelihood_difference_stderr": 0.5663537493866246, "pct_stereotype": 0.8064516129032258, "pct_stereotype_stderr": 0.041189832133487855 }, "hendrycksTest-professional_law": { "acc": 0.2692307692307692, "acc_stderr": 0.01132873440314032, "acc_norm": 0.27835723598435463, "acc_norm_stderr": 0.011446990197380989 }, "hendrycksTest-public_relations": { "acc": 0.24545454545454545, "acc_stderr": 0.041220665028782834, "acc_norm": 0.19090909090909092, "acc_norm_stderr": 0.03764425585984924 }, "hendrycksTest-anatomy": { "acc": 0.21481481481481482, "acc_stderr": 0.035478541985608236, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.03633384414073462 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.721580038265306, "likelihood_difference_stderr": 0.3810892591687784, "pct_stereotype": 0.41836734693877553, "pct_stereotype_stderr": 0.03532530943876561 }, "hendrycksTest-medical_genetics": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-logical_fallacies": { "acc": 0.26993865030674846, "acc_stderr": 0.03487825168497892, "acc_norm": 0.294478527607362, "acc_norm_stderr": 0.03581165790474082 }, "hendrycksTest-machine_learning": { "acc": 0.2767857142857143, "acc_stderr": 0.042466243366976256, "acc_norm": 0.25, "acc_norm_stderr": 0.04109974682633932 }, "hendrycksTest-professional_accounting": { "acc": 0.2765957446808511, "acc_stderr": 0.02668456434046101, "acc_norm": 0.26595744680851063, "acc_norm_stderr": 0.026358065698880582 }, "hendrycksTest-electrical_engineering": { "acc": 0.2620689655172414, "acc_stderr": 0.036646663372252565, "acc_norm": 0.31724137931034485, "acc_norm_stderr": 0.038783523721386215 }, "hendrycksTest-management": { "acc": 0.18446601941747573, "acc_stderr": 0.03840423627288276, "acc_norm": 0.21359223300970873, "acc_norm_stderr": 0.04058042015646034 }, "hendrycksTest-human_sexuality": { "acc": 0.32061068702290074, "acc_stderr": 0.04093329229834278, "acc_norm": 0.2748091603053435, "acc_norm_stderr": 0.03915345408847835 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.8314144736842106, "likelihood_difference_stderr": 0.2638399370112327, "pct_stereotype": 0.6105263157894737, "pct_stereotype_stderr": 0.035469931637371596 }, "crows_pairs_english_age": { "likelihood_difference": 2.534855769230769, "likelihood_difference_stderr": 0.2981689993648672, "pct_stereotype": 0.5054945054945055, "pct_stereotype_stderr": 0.05270144531112881 }, "crows_pairs_english_disability": { "likelihood_difference": 5.341346153846154, "likelihood_difference_stderr": 0.6140137588967796, "pct_stereotype": 0.6307692307692307, "pct_stereotype_stderr": 0.06032456592830047 }, "hendrycksTest-astronomy": { "acc": 0.19078947368421054, "acc_stderr": 0.031975658210325, "acc_norm": 0.3684210526315789, "acc_norm_stderr": 0.03925523381052932 }, "hendrycksTest-marketing": { "acc": 0.2606837606837607, "acc_stderr": 0.028760348956523414, "acc_norm": 0.2948717948717949, "acc_norm_stderr": 0.02987257770889117 }, "hendrycksTest-nutrition": { "acc": 0.2581699346405229, "acc_stderr": 0.025058503316958157, "acc_norm": 0.3300653594771242, "acc_norm_stderr": 0.02692565465361569 }, "hendrycksTest-college_medicine": { "acc": 0.24855491329479767, "acc_stderr": 0.03295304696818318, "acc_norm": 0.3179190751445087, "acc_norm_stderr": 0.03550683989165581 }, "hendrycksTest-international_law": { "acc": 0.19008264462809918, "acc_stderr": 0.03581796951709282, "acc_norm": 0.38016528925619836, "acc_norm_stderr": 0.04431324501968431 }, "crows_pairs_french_religion": { "likelihood_difference": 4.681521739130435, "likelihood_difference_stderr": 0.49794984189910335, "pct_stereotype": 0.5826086956521739, "pct_stereotype_stderr": 0.04618572379512261 }, "hendrycksTest-prehistory": { "acc": 0.31790123456790126, "acc_stderr": 0.025910063528240868, "acc_norm": 0.23765432098765432, "acc_norm_stderr": 0.02368359183700855 }, "hendrycksTest-professional_psychology": { "acc": 0.2434640522875817, "acc_stderr": 0.017362473762146634, "acc_norm": 0.272875816993464, "acc_norm_stderr": 0.018020474148393577 }, "hendrycksTest-clinical_knowledge": { "acc": 0.22641509433962265, "acc_stderr": 0.025757559893106737, "acc_norm": 0.3283018867924528, "acc_norm_stderr": 0.028901593612411784 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 6.927197802197802, "likelihood_difference_stderr": 0.492595571005076, "pct_stereotype": 0.7912087912087912, "pct_stereotype_stderr": 0.04284305206509431 }, "hendrycksTest-high_school_psychology": { "acc": 0.29541284403669726, "acc_stderr": 0.019560619182976, "acc_norm": 0.26788990825688075, "acc_norm_stderr": 0.018987462257978652 }, "arc_easy": { "acc": 0.4090909090909091, "acc_stderr": 0.01008877515261578, "acc_norm": 0.3720538720538721, "acc_norm_stderr": 0.00991818719309646 }, "hendrycksTest-global_facts": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-business_ethics": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, "crows_pairs_french": { "likelihood_difference": 5.22263994484198, "likelihood_difference_stderr": 0.12633351172363225, "pct_stereotype": 0.4531902206320811, "pct_stereotype_stderr": 0.012159658951661536 }, "hendrycksTest-college_computer_science": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.8509114583333335, "likelihood_difference_stderr": 0.4282178435765489, "pct_stereotype": 0.6388888888888888, "pct_stereotype_stderr": 0.05700381461700859 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.573616600790514, "likelihood_difference_stderr": 0.40025626909052686, "pct_stereotype": 0.2845849802371542, "pct_stereotype_stderr": 0.028423970522085215 }, "hendrycksTest-high_school_computer_science": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "hendrycksTest-miscellaneous": { "acc": 0.27330779054916987, "acc_stderr": 0.015936681062628556, "acc_norm": 0.24393358876117496, "acc_norm_stderr": 0.015357212665829475 }, "hendrycksTest-world_religions": { "acc": 0.26900584795321636, "acc_stderr": 0.0340105262010409, "acc_norm": 0.32748538011695905, "acc_norm_stderr": 0.035993357714560276 }, "crows_pairs_english_religion": { "likelihood_difference": 3.5273085585585586, "likelihood_difference_stderr": 0.42554000873855896, "pct_stereotype": 0.6396396396396397, "pct_stereotype_stderr": 0.04577621167070314 }, "hendrycksTest-us_foreign_policy": { "acc": 0.29, "acc_stderr": 0.04560480215720683, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "crows_pairs_french_disability": { "likelihood_difference": 6.388257575757576, "likelihood_difference_stderr": 0.6176380444815222, "pct_stereotype": 0.4393939393939394, "pct_stereotype_stderr": 0.06156009014560979 }, "hendrycksTest-sociology": { "acc": 0.27860696517412936, "acc_stderr": 0.031700561834973086, "acc_norm": 0.31343283582089554, "acc_norm_stderr": 0.032801882053486435 }, "logiqa": { "acc": 0.22427035330261136, "acc_stderr": 0.016360043348265515, "acc_norm": 0.26881720430107525, "acc_norm_stderr": 0.01738940946371263 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-high_school_mathematics": { "acc": 0.17407407407407408, "acc_stderr": 0.02311859603355185, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.026962424325073838 }, "hendrycksTest-college_biology": { "acc": 0.3055555555555556, "acc_stderr": 0.03852084696008534, "acc_norm": 0.2847222222222222, "acc_norm_stderr": 0.03773809990686934 }, "hendrycksTest-high_school_us_history": { "acc": 0.28431372549019607, "acc_stderr": 0.031660096793998116, "acc_norm": 0.2696078431372549, "acc_norm_stderr": 0.031145570659486782 }, "hendrycksTest-college_physics": { "acc": 0.17647058823529413, "acc_stderr": 0.03793281185307809, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.04280105837364396 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.5295890748031495, "likelihood_difference_stderr": 0.19219930990870296, "pct_stereotype": 0.484251968503937, "pct_stereotype_stderr": 0.02219476276265932 }, "hendrycksTest-high_school_statistics": { "acc": 0.20833333333333334, "acc_stderr": 0.02769691071309394, "acc_norm": 0.2638888888888889, "acc_norm_stderr": 0.03005820270430985 }, "hendrycksTest-high_school_physics": { "acc": 0.15894039735099338, "acc_stderr": 0.02985278852870104, "acc_norm": 0.2185430463576159, "acc_norm_stderr": 0.03374235550425694 }, "hendrycksTest-jurisprudence": { "acc": 0.2037037037037037, "acc_stderr": 0.03893542518824847, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.04668408033024931 }, "arc_challenge": { "acc": 0.18003412969283278, "acc_stderr": 0.011227856729050054, "acc_norm": 0.20733788395904437, "acc_norm_stderr": 0.011846905782971363 }, "winogrande": { "acc": 0.4996053670086819, "acc_stderr": 0.01405248130604952 }, "hendrycksTest-econometrics": { "acc": 0.21052631578947367, "acc_stderr": 0.038351539543994194, "acc_norm": 0.20175438596491227, "acc_norm_stderr": 0.037752050135836386 }, "hendrycksTest-virology": { "acc": 0.24096385542168675, "acc_stderr": 0.033293941190735296, "acc_norm": 0.2289156626506024, "acc_norm_stderr": 0.03270745277352477 }, "hendrycksTest-high_school_biology": { "acc": 0.25161290322580643, "acc_stderr": 0.02468597928623996, "acc_norm": 0.3419354838709677, "acc_norm_stderr": 0.026985289576552732 }, "crows_pairs_english_gender": { "likelihood_difference": 2.903515625, "likelihood_difference_stderr": 0.25564415610007174, "pct_stereotype": 0.54375, "pct_stereotype_stderr": 0.027887252708654657 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.650173611111111, "likelihood_difference_stderr": 0.6364202127315652, "pct_stereotype": 0.4444444444444444, "pct_stereotype_stderr": 0.05897165471491952 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.25384615384615383, "acc_stderr": 0.022066054378726257, "acc_norm": 0.2743589743589744, "acc_norm_stderr": 0.02262276576749322 }, "hendrycksTest-formal_logic": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303317, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.03970158273235173 }, "sciq": { "acc": 0.681, "acc_stderr": 0.014746404865473484, "acc_norm": 0.616, "acc_norm_stderr": 0.01538768276189707 }, "crows_pairs_french_autre": { "likelihood_difference": 4.677884615384615, "likelihood_difference_stderr": 0.9490566058977856, "pct_stereotype": 0.46153846153846156, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-conceptual_physics": { "acc": 0.2680851063829787, "acc_stderr": 0.02895734278834235, "acc_norm": 0.20425531914893616, "acc_norm_stderr": 0.026355158413349424 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.2538860103626943, "acc_stderr": 0.03141024780565318, "acc_norm": 0.24352331606217617, "acc_norm_stderr": 0.03097543638684542 }, "hendrycksTest-high_school_chemistry": { "acc": 0.21674876847290642, "acc_stderr": 0.028990331252516235, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.031785297106427496 }, "hendrycksTest-high_school_european_history": { "acc": 0.21818181818181817, "acc_stderr": 0.03225078108306289, "acc_norm": 0.24848484848484848, "acc_norm_stderr": 0.03374402644139403 }, "hendrycksTest-high_school_geography": { "acc": 0.2676767676767677, "acc_stderr": 0.03154449888270286, "acc_norm": 0.3282828282828283, "acc_norm_stderr": 0.03345678422756775 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768081 }, "crows_pairs_french_age": { "likelihood_difference": 4.994791666666667, "likelihood_difference_stderr": 0.5220305178059566, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.052999894000318 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574915 }, "hendrycksTest-high_school_world_history": { "acc": 0.23628691983122363, "acc_stderr": 0.027652153144159277, "acc_norm": 0.3206751054852321, "acc_norm_stderr": 0.03038193194999042 } }, "versions": { "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-college_mathematics": 0, "crows_pairs_english_autre": 0, "hendrycksTest-human_aging": 0, "lambada_openai": 0, "crows_pairs_english": 0, "hendrycksTest-college_chemistry": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-abstract_algebra": 0, "crows_pairs_french_gender": 0, "hendrycksTest-professional_medicine": 0, "crows_pairs_french_race_color": 0, "piqa": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-anatomy": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-management": 0, "hendrycksTest-human_sexuality": 0, "crows_pairs_english_socioeconomic": 0, "crows_pairs_english_age": 0, "crows_pairs_english_disability": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-marketing": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-international_law": 0, "crows_pairs_french_religion": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-clinical_knowledge": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_psychology": 0, "arc_easy": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-business_ethics": 0, "crows_pairs_french": 0, "hendrycksTest-college_computer_science": 0, "crows_pairs_english_physical_appearance": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_religion": 0, "hendrycksTest-us_foreign_policy": 0, "crows_pairs_french_disability": 0, "hendrycksTest-sociology": 0, "logiqa": 0, "wsc": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-college_physics": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-jurisprudence": 0, "arc_challenge": 0, "winogrande": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-virology": 0, "hendrycksTest-high_school_biology": 0, "crows_pairs_english_gender": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-formal_logic": 0, "sciq": 0, "crows_pairs_french_autre": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-computer_security": 0, "crows_pairs_french_age": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_world_history": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step63000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:6", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }