{ "results": { "hendrycksTest-high_school_world_history": { "acc": 0.25738396624472576, "acc_stderr": 0.028458820991460267, "acc_norm": 0.3037974683544304, "acc_norm_stderr": 0.029936696387138615 }, "lambada_openai": { "ppl": 136.24789967804702, "ppl_stderr": 5.841149004257701, "acc": 0.21036289540073744, "acc_stderr": 0.005678196483274596 }, "winogrande": { "acc": 0.5027624309392266, "acc_stderr": 0.014052271211616441 }, "hendrycksTest-world_religions": { "acc": 0.28654970760233917, "acc_stderr": 0.03467826685703826, "acc_norm": 0.34502923976608185, "acc_norm_stderr": 0.036459813773888065 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.7294560185185186, "likelihood_difference_stderr": 0.28249531206763423, "pct_stereotype": 0.39351851851851855, "pct_stereotype_stderr": 0.03331747876370312 }, "hendrycksTest-professional_law": { "acc": 0.24771838331160365, "acc_stderr": 0.011025499291443738, "acc_norm": 0.2770534550195567, "acc_norm_stderr": 0.01143046244371968 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.25210084033613445, "acc_stderr": 0.02820554503327771, "acc_norm": 0.3277310924369748, "acc_norm_stderr": 0.030489911417673227 }, "hendrycksTest-medical_genetics": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "hendrycksTest-logical_fallacies": { "acc": 0.2331288343558282, "acc_stderr": 0.0332201579577674, "acc_norm": 0.2822085889570552, "acc_norm_stderr": 0.03536117886664743 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.96875, "likelihood_difference_stderr": 0.6136503830214869, "pct_stereotype": 0.8021978021978022, "pct_stereotype_stderr": 0.04198895203196222 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.24871794871794872, "acc_stderr": 0.021916957709213803, "acc_norm": 0.26153846153846155, "acc_norm_stderr": 0.022282141204204426 }, "hendrycksTest-college_biology": { "acc": 0.2361111111111111, "acc_stderr": 0.03551446610810826, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.03476590104304134 }, "hendrycksTest-high_school_us_history": { "acc": 0.23529411764705882, "acc_stderr": 0.029771775228145628, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.03019028245350195 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.261439732142857, "likelihood_difference_stderr": 0.3621777755494698, "pct_stereotype": 0.42857142857142855, "pct_stereotype_stderr": 0.035438495596916704 }, "hendrycksTest-college_mathematics": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.111616847826087, "likelihood_difference_stderr": 0.21388555403339707, "pct_stereotype": 0.4391304347826087, "pct_stereotype_stderr": 0.023164416405982075 }, "hendrycksTest-electrical_engineering": { "acc": 0.2689655172413793, "acc_stderr": 0.036951833116502325, "acc_norm": 0.31724137931034485, "acc_norm_stderr": 0.03878352372138623 }, "hendrycksTest-sociology": { "acc": 0.2935323383084577, "acc_stderr": 0.03220024104534205, "acc_norm": 0.2835820895522388, "acc_norm_stderr": 0.03187187537919797 }, "hendrycksTest-high_school_mathematics": { "acc": 0.16666666666666666, "acc_stderr": 0.022722578464550523, "acc_norm": 0.25555555555555554, "acc_norm_stderr": 0.026593939101844075 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-high_school_psychology": { "acc": 0.26055045871559634, "acc_stderr": 0.01881918203485007, "acc_norm": 0.24220183486238533, "acc_norm_stderr": 0.01836817630659862 }, "piqa": { "acc": 0.5865070729053319, "acc_stderr": 0.011489895831821131, "acc_norm": 0.5930359085963003, "acc_norm_stderr": 0.011462093919190166 }, "crows_pairs_french_religion": { "likelihood_difference": 5.045380434782609, "likelihood_difference_stderr": 0.5425900004201588, "pct_stereotype": 0.40869565217391307, "pct_stereotype_stderr": 0.04604188749503788 }, "hendrycksTest-machine_learning": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467764, "acc_norm": 0.19642857142857142, "acc_norm_stderr": 0.03770970049347018 }, "hendrycksTest-econometrics": { "acc": 0.2894736842105263, "acc_stderr": 0.04266339443159394, "acc_norm": 0.2543859649122807, "acc_norm_stderr": 0.04096985139843671 }, "hendrycksTest-college_computer_science": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "logiqa": { "acc": 0.2012288786482335, "acc_stderr": 0.01572532582742823, "acc_norm": 0.2780337941628264, "acc_norm_stderr": 0.017573187770282713 }, "hendrycksTest-formal_logic": { "acc": 0.3253968253968254, "acc_stderr": 0.041905964388711366, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.040735243221471255 }, "hendrycksTest-elementary_mathematics": { "acc": 0.25132275132275134, "acc_stderr": 0.022340482339643895, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.02306818884826111 }, "hendrycksTest-international_law": { "acc": 0.12396694214876033, "acc_stderr": 0.030083098716035237, "acc_norm": 0.4049586776859504, "acc_norm_stderr": 0.044811377559424694 }, "hendrycksTest-virology": { "acc": 0.21686746987951808, "acc_stderr": 0.03208284450356365, "acc_norm": 0.2469879518072289, "acc_norm_stderr": 0.03357351982064536 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.7626644736842105, "likelihood_difference_stderr": 0.263472659726499, "pct_stereotype": 0.6421052631578947, "pct_stereotype_stderr": 0.03486983309720002 }, "hendrycksTest-computer_security": { "acc": 0.19, "acc_stderr": 0.03942772444036623, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.159722222222222, "likelihood_difference_stderr": 0.6604052452240913, "pct_stereotype": 0.4861111111111111, "pct_stereotype_stderr": 0.05931618532716555 }, "hendrycksTest-conceptual_physics": { "acc": 0.23829787234042554, "acc_stderr": 0.027851252973889764, "acc_norm": 0.19574468085106383, "acc_norm_stderr": 0.025937853139977145 }, "hendrycksTest-college_chemistry": { "acc": 0.35, "acc_stderr": 0.04793724854411019, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "hendrycksTest-clinical_knowledge": { "acc": 0.22264150943396227, "acc_stderr": 0.025604233470899098, "acc_norm": 0.3169811320754717, "acc_norm_stderr": 0.028637235639800928 }, "hendrycksTest-high_school_geography": { "acc": 0.21212121212121213, "acc_stderr": 0.029126522834586832, "acc_norm": 0.26262626262626265, "acc_norm_stderr": 0.031353050095330855 }, "sciq": { "acc": 0.65, "acc_stderr": 0.015090650341444231, "acc_norm": 0.577, "acc_norm_stderr": 0.01563058909047635 }, "crows_pairs_french_autre": { "likelihood_difference": 3.9399038461538463, "likelihood_difference_stderr": 0.93832403380636, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-prehistory": { "acc": 0.25925925925925924, "acc_stderr": 0.02438366553103545, "acc_norm": 0.18518518518518517, "acc_norm_stderr": 0.0216138093952248 }, "crows_pairs_french": { "likelihood_difference": 5.003405448717949, "likelihood_difference_stderr": 0.12449132819328972, "pct_stereotype": 0.4364937388193202, "pct_stereotype_stderr": 0.012114385095725013 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.6161417322834644, "likelihood_difference_stderr": 0.18602106212344902, "pct_stereotype": 0.421259842519685, "pct_stereotype_stderr": 0.021928698676414303 }, "hendrycksTest-moral_disputes": { "acc": 0.23121387283236994, "acc_stderr": 0.022698657167855713, "acc_norm": 0.31213872832369943, "acc_norm_stderr": 0.02494679222527231 }, "hendrycksTest-college_medicine": { "acc": 0.21965317919075145, "acc_stderr": 0.031568093627031744, "acc_norm": 0.3468208092485549, "acc_norm_stderr": 0.036291466701596636 }, "hendrycksTest-professional_medicine": { "acc": 0.2610294117647059, "acc_stderr": 0.026679252270103128, "acc_norm": 0.27205882352941174, "acc_norm_stderr": 0.027033041151681456 }, "crows_pairs_english_autre": { "likelihood_difference": 5.296875, "likelihood_difference_stderr": 2.3872581014414274, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "hendrycksTest-security_studies": { "acc": 0.3551020408163265, "acc_stderr": 0.030635655150387634, "acc_norm": 0.24489795918367346, "acc_norm_stderr": 0.02752963744017493 }, "crows_pairs_french_age": { "likelihood_difference": 3.9774305555555554, "likelihood_difference_stderr": 0.38401151759615354, "pct_stereotype": 0.4222222222222222, "pct_stereotype_stderr": 0.05235473399540657 }, "hendrycksTest-human_aging": { "acc": 0.2600896860986547, "acc_stderr": 0.029442495585857476, "acc_norm": 0.22869955156950672, "acc_norm_stderr": 0.02818824004692919 }, "hendrycksTest-management": { "acc": 0.20388349514563106, "acc_stderr": 0.039891398595317706, "acc_norm": 0.2524271844660194, "acc_norm_stderr": 0.04301250399690878 }, "crows_pairs_french_gender": { "likelihood_difference": 4.199961059190031, "likelihood_difference_stderr": 0.19666947220858882, "pct_stereotype": 0.5077881619937694, "pct_stereotype_stderr": 0.02794745876935634 }, "hendrycksTest-global_facts": { "acc": 0.2, "acc_stderr": 0.04020151261036847, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036846 }, "crows_pairs_english_age": { "likelihood_difference": 2.824004120879121, "likelihood_difference_stderr": 0.25515072273014766, "pct_stereotype": 0.5714285714285714, "pct_stereotype_stderr": 0.05216405309573015 }, "hendrycksTest-business_ethics": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "hendrycksTest-anatomy": { "acc": 0.23703703703703705, "acc_stderr": 0.03673731683969506, "acc_norm": 0.21481481481481482, "acc_norm_stderr": 0.03547854198560824 }, "hendrycksTest-jurisprudence": { "acc": 0.17592592592592593, "acc_stderr": 0.036809181416738807, "acc_norm": 0.3425925925925926, "acc_norm_stderr": 0.04587904741301812 }, "hendrycksTest-professional_accounting": { "acc": 0.25177304964539005, "acc_stderr": 0.0258921511567094, "acc_norm": 0.2624113475177305, "acc_norm_stderr": 0.026244920349843007 }, "hendrycksTest-high_school_chemistry": { "acc": 0.1724137931034483, "acc_stderr": 0.0265776721830366, "acc_norm": 0.2660098522167488, "acc_norm_stderr": 0.03108982600293752 }, "hendrycksTest-high_school_computer_science": { "acc": 0.19, "acc_stderr": 0.03942772444036622, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720683 }, "hendrycksTest-college_physics": { "acc": 0.17647058823529413, "acc_stderr": 0.0379328118530781, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.042801058373643966 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.19689119170984457, "acc_stderr": 0.028697873971860677, "acc_norm": 0.29533678756476683, "acc_norm_stderr": 0.03292296639155141 }, "crows_pairs_english_gender": { "likelihood_difference": 3.058837890625, "likelihood_difference_stderr": 0.2502387813084938, "pct_stereotype": 0.56875, "pct_stereotype_stderr": 0.027728726065513788 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.24692737430167597, "acc_norm_stderr": 0.014422292204808836 }, "hendrycksTest-public_relations": { "acc": 0.23636363636363636, "acc_stderr": 0.040693063197213754, "acc_norm": 0.16363636363636364, "acc_norm_stderr": 0.035434330542986774 }, "crows_pairs_english": { "likelihood_difference": 3.6419387298747763, "likelihood_difference_stderr": 0.10530121451159095, "pct_stereotype": 0.5336911150864639, "pct_stereotype_stderr": 0.012185541257180466 }, "arc_challenge": { "acc": 0.17491467576791808, "acc_stderr": 0.011101562501828234, "acc_norm": 0.22184300341296928, "acc_norm_stderr": 0.012141659068147887 }, "hendrycksTest-marketing": { "acc": 0.27350427350427353, "acc_stderr": 0.029202540153431197, "acc_norm": 0.28205128205128205, "acc_norm_stderr": 0.02948036054954119 }, "hendrycksTest-philosophy": { "acc": 0.2057877813504823, "acc_stderr": 0.022961339906764248, "acc_norm": 0.2829581993569132, "acc_norm_stderr": 0.025583062489984824 }, "hendrycksTest-miscellaneous": { "acc": 0.2503192848020434, "acc_stderr": 0.015491088951494576, "acc_norm": 0.24393358876117496, "acc_norm_stderr": 0.015357212665829475 }, "crows_pairs_english_religion": { "likelihood_difference": 3.3603603603603602, "likelihood_difference_stderr": 0.39596313038117587, "pct_stereotype": 0.6666666666666666, "pct_stereotype_stderr": 0.04494665749754944 }, "hendrycksTest-nutrition": { "acc": 0.22549019607843138, "acc_stderr": 0.023929155517351284, "acc_norm": 0.33986928104575165, "acc_norm_stderr": 0.027121956071388852 }, "hendrycksTest-us_foreign_policy": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-high_school_physics": { "acc": 0.2119205298013245, "acc_stderr": 0.03336767086567977, "acc_norm": 0.2185430463576159, "acc_norm_stderr": 0.033742355504256936 }, "hendrycksTest-high_school_biology": { "acc": 0.25483870967741934, "acc_stderr": 0.024790118459332208, "acc_norm": 0.32903225806451614, "acc_norm_stderr": 0.02672949906834997 }, "crows_pairs_english_disability": { "likelihood_difference": 5.554567307692308, "likelihood_difference_stderr": 0.6793924919190663, "pct_stereotype": 0.6461538461538462, "pct_stereotype_stderr": 0.05977027026123099 }, "hendrycksTest-human_sexuality": { "acc": 0.31297709923664124, "acc_stderr": 0.04066962905677698, "acc_norm": 0.3053435114503817, "acc_norm_stderr": 0.04039314978724561 }, "arc_easy": { "acc": 0.3952020202020202, "acc_stderr": 0.01003189405279098, "acc_norm": 0.3522727272727273, "acc_norm_stderr": 0.009801753933112771 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.968413978494624, "likelihood_difference_stderr": 0.6082206813914695, "pct_stereotype": 0.8064516129032258, "pct_stereotype_stderr": 0.041189832133487855 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.775938735177865, "likelihood_difference_stderr": 0.3805565031745419, "pct_stereotype": 0.233201581027668, "pct_stereotype_stderr": 0.026638273845497516 }, "hendrycksTest-professional_psychology": { "acc": 0.24019607843137256, "acc_stderr": 0.017282760695167418, "acc_norm": 0.25980392156862747, "acc_norm_stderr": 0.017740899509177795 }, "hendrycksTest-astronomy": { "acc": 0.20394736842105263, "acc_stderr": 0.0327900040631005, "acc_norm": 0.34210526315789475, "acc_norm_stderr": 0.03860731599316092 }, "hendrycksTest-abstract_algebra": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "crows_pairs_french_disability": { "likelihood_difference": 6.735795454545454, "likelihood_difference_stderr": 0.7110577603651055, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758 }, "hendrycksTest-high_school_european_history": { "acc": 0.23636363636363636, "acc_stderr": 0.033175059300091805, "acc_norm": 0.3090909090909091, "acc_norm_stderr": 0.036085410115739666 }, "hendrycksTest-high_school_statistics": { "acc": 0.14351851851851852, "acc_stderr": 0.02391077925264438, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.028353212866863434 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.609375, "likelihood_difference_stderr": 0.41664133487403315, "pct_stereotype": 0.6111111111111112, "pct_stereotype_stderr": 0.057855371034784615 } }, "versions": { "hendrycksTest-high_school_world_history": 0, "lambada_openai": 0, "winogrande": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-logical_fallacies": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-college_mathematics": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-sociology": 0, "hendrycksTest-high_school_mathematics": 0, "wsc": 0, "hendrycksTest-high_school_psychology": 0, "piqa": 0, "crows_pairs_french_religion": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-college_computer_science": 0, "logiqa": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-international_law": 0, "hendrycksTest-virology": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-computer_security": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-high_school_geography": 0, "sciq": 0, "crows_pairs_french_autre": 0, "hendrycksTest-prehistory": 0, "crows_pairs_french": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-professional_medicine": 0, "crows_pairs_english_autre": 0, "hendrycksTest-security_studies": 0, "crows_pairs_french_age": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-management": 0, "crows_pairs_french_gender": 0, "hendrycksTest-global_facts": 0, "crows_pairs_english_age": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-high_school_government_and_politics": 0, "crows_pairs_english_gender": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-public_relations": 0, "crows_pairs_english": 0, "arc_challenge": 0, "hendrycksTest-marketing": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-miscellaneous": 0, "crows_pairs_english_religion": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-high_school_biology": 0, "crows_pairs_english_disability": 0, "hendrycksTest-human_sexuality": 0, "arc_easy": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-abstract_algebra": 0, "crows_pairs_french_disability": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_english_physical_appearance": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step13000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:1", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }