{ "results": { "hendrycksTest-prehistory": { "acc": 0.30246913580246915, "acc_stderr": 0.025557653981868038, "acc_norm": 0.21296296296296297, "acc_norm_stderr": 0.022779719088733396 }, "hendrycksTest-moral_disputes": { "acc": 0.26011560693641617, "acc_stderr": 0.02361867831006937, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.024685316867257792 }, "arc_easy": { "acc": 0.39941077441077444, "acc_stderr": 0.010050018228742104, "acc_norm": 0.35984848484848486, "acc_norm_stderr": 0.009848484848484853 }, "hendrycksTest-human_sexuality": { "acc": 0.29770992366412213, "acc_stderr": 0.040103589424622034, "acc_norm": 0.2595419847328244, "acc_norm_stderr": 0.03844876139785271 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.420530913978495, "likelihood_difference_stderr": 0.5455347845945085, "pct_stereotype": 0.8172043010752689, "pct_stereotype_stderr": 0.04029530010615517 }, "hendrycksTest-conceptual_physics": { "acc": 0.2170212765957447, "acc_stderr": 0.02694748312149623, "acc_norm": 0.18723404255319148, "acc_norm_stderr": 0.025501588341883583 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.1190711462450595, "likelihood_difference_stderr": 0.4052713659970057, "pct_stereotype": 0.2924901185770751, "pct_stereotype_stderr": 0.02865639690849427 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.6803385416666665, "likelihood_difference_stderr": 0.2726355174224698, "pct_stereotype": 0.4398148148148148, "pct_stereotype_stderr": 0.03385177976044811 }, "hendrycksTest-professional_accounting": { "acc": 0.26595744680851063, "acc_stderr": 0.026358065698880585, "acc_norm": 0.2375886524822695, "acc_norm_stderr": 0.025389512552729906 }, "hendrycksTest-security_studies": { "acc": 0.3346938775510204, "acc_stderr": 0.030209235226242307, "acc_norm": 0.2693877551020408, "acc_norm_stderr": 0.02840125202902294 }, "hendrycksTest-college_mathematics": { "acc": 0.18, "acc_stderr": 0.03861229196653697, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-jurisprudence": { "acc": 0.21296296296296297, "acc_stderr": 0.0395783547198098, "acc_norm": 0.35185185185185186, "acc_norm_stderr": 0.04616631111801713 }, "piqa": { "acc": 0.5903155603917302, "acc_stderr": 0.011473932007187606, "acc_norm": 0.5892274211099021, "acc_norm_stderr": 0.011478565556775776 }, "hendrycksTest-high_school_physics": { "acc": 0.23178807947019867, "acc_stderr": 0.03445406271987054, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.03543304234389985 }, "crows_pairs_english_religion": { "likelihood_difference": 3.689470720720721, "likelihood_difference_stderr": 0.45650051870103214, "pct_stereotype": 0.6576576576576577, "pct_stereotype_stderr": 0.04524117824423198 }, "hendrycksTest-computer_security": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-high_school_biology": { "acc": 0.23870967741935484, "acc_stderr": 0.02425107126220884, "acc_norm": 0.3, "acc_norm_stderr": 0.026069362295335123 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.24102564102564103, "acc_stderr": 0.021685546665333184, "acc_norm": 0.28717948717948716, "acc_norm_stderr": 0.022939925418530616 }, "hendrycksTest-high_school_geography": { "acc": 0.2727272727272727, "acc_stderr": 0.03173071239071724, "acc_norm": 0.3181818181818182, "acc_norm_stderr": 0.0331847733384533 }, "hendrycksTest-college_medicine": { "acc": 0.2254335260115607, "acc_stderr": 0.03186209851641143, "acc_norm": 0.3179190751445087, "acc_norm_stderr": 0.0355068398916558 }, "crows_pairs_english": { "likelihood_difference": 3.5890820661896243, "likelihood_difference_stderr": 0.10115312974643073, "pct_stereotype": 0.5491949910554562, "pct_stereotype_stderr": 0.012154039490138224 }, "crows_pairs_english_autre": { "likelihood_difference": 5.323863636363637, "likelihood_difference_stderr": 1.31611564166353, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.1574591643244434 }, "lambada_openai": { "ppl": 112.36318862751271, "ppl_stderr": 4.5861160676234745, "acc": 0.22821657287017272, "acc_stderr": 0.005847003943226629 }, "hendrycksTest-world_religions": { "acc": 0.2807017543859649, "acc_stderr": 0.034462962170884265, "acc_norm": 0.3391812865497076, "acc_norm_stderr": 0.03631053496488905 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.20725388601036268, "acc_stderr": 0.029252823291803648, "acc_norm": 0.2694300518134715, "acc_norm_stderr": 0.03201867122877793 }, "hendrycksTest-college_biology": { "acc": 0.2847222222222222, "acc_stderr": 0.03773809990686936, "acc_norm": 0.2916666666666667, "acc_norm_stderr": 0.038009680605548574 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.679470486111111, "likelihood_difference_stderr": 0.4176954857537189, "pct_stereotype": 0.5972222222222222, "pct_stereotype_stderr": 0.05820650942569533 }, "hendrycksTest-miscellaneous": { "acc": 0.2822477650063857, "acc_stderr": 0.016095302969878555, "acc_norm": 0.2771392081736909, "acc_norm_stderr": 0.01600563629412242 }, "crows_pairs_english_age": { "likelihood_difference": 2.8324175824175826, "likelihood_difference_stderr": 0.2900137402487036, "pct_stereotype": 0.45054945054945056, "pct_stereotype_stderr": 0.052446231001012276 }, "hendrycksTest-professional_law": { "acc": 0.24967405475880053, "acc_stderr": 0.011054538377832322, "acc_norm": 0.28096479791395046, "acc_norm_stderr": 0.011479684550077689 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.460503472222222, "likelihood_difference_stderr": 0.5691473963017525, "pct_stereotype": 0.4722222222222222, "pct_stereotype_stderr": 0.05924743948371486 }, "hendrycksTest-medical_genetics": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "hendrycksTest-college_chemistry": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "crows_pairs_french_disability": { "likelihood_difference": 6.002840909090909, "likelihood_difference_stderr": 0.6010893644517457, "pct_stereotype": 0.3787878787878788, "pct_stereotype_stderr": 0.0601674102524024 }, "hendrycksTest-clinical_knowledge": { "acc": 0.24528301886792453, "acc_stderr": 0.02648035717989568, "acc_norm": 0.33962264150943394, "acc_norm_stderr": 0.029146904747798335 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 4.15016447368421, "likelihood_difference_stderr": 0.27969083115380783, "pct_stereotype": 0.6789473684210526, "pct_stereotype_stderr": 0.03396059335824887 }, "hendrycksTest-high_school_mathematics": { "acc": 0.15925925925925927, "acc_stderr": 0.02231039463004062, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.025787874220959316 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.891063456632653, "likelihood_difference_stderr": 0.3589199389598812, "pct_stereotype": 0.4387755102040816, "pct_stereotype_stderr": 0.035536298657903934 }, "winogrande": { "acc": 0.5027624309392266, "acc_stderr": 0.014052271211616438 }, "hendrycksTest-elementary_mathematics": { "acc": 0.24867724867724866, "acc_stderr": 0.022261817692400168, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.02286083830923207 }, "hendrycksTest-professional_medicine": { "acc": 0.27941176470588236, "acc_stderr": 0.027257202606114944, "acc_norm": 0.27205882352941174, "acc_norm_stderr": 0.027033041151681456 }, "hendrycksTest-us_foreign_policy": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "arc_challenge": { "acc": 0.17235494880546076, "acc_stderr": 0.011037113093461295, "acc_norm": 0.20477815699658702, "acc_norm_stderr": 0.011792544338513403 }, "hendrycksTest-anatomy": { "acc": 0.2074074074074074, "acc_stderr": 0.03502553170678318, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.03633384414073462 }, "hendrycksTest-high_school_european_history": { "acc": 0.23030303030303031, "acc_stderr": 0.03287666758603489, "acc_norm": 0.2909090909090909, "acc_norm_stderr": 0.03546563019624336 }, "crows_pairs_french_age": { "likelihood_difference": 4.747916666666667, "likelihood_difference_stderr": 0.5033904535606084, "pct_stereotype": 0.45555555555555555, "pct_stereotype_stderr": 0.05279009646630345 }, "hendrycksTest-electrical_engineering": { "acc": 0.2620689655172414, "acc_stderr": 0.036646663372252565, "acc_norm": 0.2896551724137931, "acc_norm_stderr": 0.03780019230438014 }, "hendrycksTest-philosophy": { "acc": 0.2057877813504823, "acc_stderr": 0.022961339906764237, "acc_norm": 0.2958199356913183, "acc_norm_stderr": 0.025922371788818784 }, "logiqa": { "acc": 0.2304147465437788, "acc_stderr": 0.016516834820590964, "acc_norm": 0.2887864823348694, "acc_norm_stderr": 0.017775906336539225 }, "hendrycksTest-high_school_computer_science": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "hendrycksTest-virology": { "acc": 0.18674698795180722, "acc_stderr": 0.030338749144500583, "acc_norm": 0.23493975903614459, "acc_norm_stderr": 0.03300533186128922 }, "hendrycksTest-high_school_us_history": { "acc": 0.27450980392156865, "acc_stderr": 0.03132179803083291, "acc_norm": 0.28921568627450983, "acc_norm_stderr": 0.031822318676475524 }, "crows_pairs_french_religion": { "likelihood_difference": 5.036141304347826, "likelihood_difference_stderr": 0.4580008943372021, "pct_stereotype": 0.6086956521739131, "pct_stereotype_stderr": 0.045709346351117126 }, "hendrycksTest-moral_scenarios": { "acc": 0.23910614525139665, "acc_stderr": 0.014265554192331149, "acc_norm": 0.24692737430167597, "acc_norm_stderr": 0.014422292204808835 }, "hendrycksTest-econometrics": { "acc": 0.21929824561403508, "acc_stderr": 0.03892431106518754, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.040493392977481425 }, "hendrycksTest-international_law": { "acc": 0.18181818181818182, "acc_stderr": 0.03520893951097653, "acc_norm": 0.39669421487603307, "acc_norm_stderr": 0.04465869780531009 }, "hendrycksTest-public_relations": { "acc": 0.3, "acc_stderr": 0.04389311454644286, "acc_norm": 0.2, "acc_norm_stderr": 0.03831305140884603 }, "hendrycksTest-professional_psychology": { "acc": 0.23039215686274508, "acc_stderr": 0.017035229258034038, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.017848089574913226 }, "hendrycksTest-high_school_chemistry": { "acc": 0.19704433497536947, "acc_stderr": 0.027986724666736212, "acc_norm": 0.2955665024630542, "acc_norm_stderr": 0.032104944337514575 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.629464285714286, "likelihood_difference_stderr": 0.45577446371656316, "pct_stereotype": 0.7692307692307693, "pct_stereotype_stderr": 0.04441155916843277 }, "hendrycksTest-high_school_statistics": { "acc": 0.20833333333333334, "acc_stderr": 0.027696910713093936, "acc_norm": 0.23148148148148148, "acc_norm_stderr": 0.028765111718046955 }, "hendrycksTest-global_facts": { "acc": 0.2, "acc_stderr": 0.04020151261036847, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816507 }, "crows_pairs_french_autre": { "likelihood_difference": 3.5528846153846154, "likelihood_difference_stderr": 1.1518474776945522, "pct_stereotype": 0.3076923076923077, "pct_stereotype_stderr": 0.13323467750529824 }, "crows_pairs_english_disability": { "likelihood_difference": 4.724038461538462, "likelihood_difference_stderr": 0.5305206431151923, "pct_stereotype": 0.6461538461538462, "pct_stereotype_stderr": 0.05977027026123099 }, "sciq": { "acc": 0.663, "acc_stderr": 0.014955087918653605, "acc_norm": 0.586, "acc_norm_stderr": 0.015583544104177506 }, "hendrycksTest-college_computer_science": { "acc": 0.2, "acc_stderr": 0.040201512610368445, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036623 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.19747899159663865, "acc_stderr": 0.02585916412205145, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.029597329730978103 }, "hendrycksTest-marketing": { "acc": 0.2777777777777778, "acc_stderr": 0.02934311479809447, "acc_norm": 0.2905982905982906, "acc_norm_stderr": 0.029745048572674054 }, "hendrycksTest-formal_logic": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.0404061017820884 }, "hendrycksTest-abstract_algebra": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-business_ethics": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "hendrycksTest-logical_fallacies": { "acc": 0.25153374233128833, "acc_stderr": 0.034089978868575295, "acc_norm": 0.3312883435582822, "acc_norm_stderr": 0.03697983910025588 }, "hendrycksTest-high_school_world_history": { "acc": 0.22784810126582278, "acc_stderr": 0.027303484599069432, "acc_norm": 0.27848101265822783, "acc_norm_stderr": 0.029178682304842548 }, "crows_pairs_french": { "likelihood_difference": 5.100435114788312, "likelihood_difference_stderr": 0.12283143626277805, "pct_stereotype": 0.4263565891472868, "pct_stereotype_stderr": 0.012080098824602488 }, "hendrycksTest-human_aging": { "acc": 0.273542600896861, "acc_stderr": 0.029918586707798834, "acc_norm": 0.2600896860986547, "acc_norm_stderr": 0.029442495585857473 }, "hendrycksTest-machine_learning": { "acc": 0.3392857142857143, "acc_stderr": 0.04493949068613539, "acc_norm": 0.24107142857142858, "acc_norm_stderr": 0.04059867246952688 }, "hendrycksTest-astronomy": { "acc": 0.17105263157894737, "acc_stderr": 0.03064360707167709, "acc_norm": 0.32894736842105265, "acc_norm_stderr": 0.038234289699266046 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.450602854330709, "likelihood_difference_stderr": 0.17629031976839826, "pct_stereotype": 0.4704724409448819, "pct_stereotype_stderr": 0.022167024359332235 }, "hendrycksTest-management": { "acc": 0.21359223300970873, "acc_stderr": 0.04058042015646034, "acc_norm": 0.3106796116504854, "acc_norm_stderr": 0.04582124160161551 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.47445652173913, "likelihood_difference_stderr": 0.22998239662821754, "pct_stereotype": 0.2956521739130435, "pct_stereotype_stderr": 0.021299910806810252 }, "hendrycksTest-high_school_psychology": { "acc": 0.23486238532110093, "acc_stderr": 0.018175110510343578, "acc_norm": 0.24954128440366974, "acc_norm_stderr": 0.018553897629501617 }, "hendrycksTest-college_physics": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617747, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.043898699568087785 }, "crows_pairs_french_gender": { "likelihood_difference": 4.302570093457944, "likelihood_difference_stderr": 0.20681513943922145, "pct_stereotype": 0.5451713395638629, "pct_stereotype_stderr": 0.027836551402899614 }, "crows_pairs_english_gender": { "likelihood_difference": 3.0423828125, "likelihood_difference_stderr": 0.24361353048777595, "pct_stereotype": 0.553125, "pct_stereotype_stderr": 0.027836160509246817 }, "hendrycksTest-sociology": { "acc": 0.2885572139303483, "acc_stderr": 0.0320384104021332, "acc_norm": 0.31840796019900497, "acc_norm_stderr": 0.03294118479054095 }, "hendrycksTest-nutrition": { "acc": 0.24836601307189543, "acc_stderr": 0.024739981355113592, "acc_norm": 0.34967320261437906, "acc_norm_stderr": 0.0273053080762747 } }, "versions": { "hendrycksTest-prehistory": 0, "hendrycksTest-moral_disputes": 0, "arc_easy": 0, "hendrycksTest-human_sexuality": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-conceptual_physics": 0, "crows_pairs_french_nationality": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-jurisprudence": 0, "piqa": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_english_religion": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_english": 0, "crows_pairs_english_autre": 0, "lambada_openai": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-college_biology": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-miscellaneous": 0, "crows_pairs_english_age": 0, "hendrycksTest-professional_law": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-college_chemistry": 0, "crows_pairs_french_disability": 0, "hendrycksTest-clinical_knowledge": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_french_socioeconomic": 0, "winogrande": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-us_foreign_policy": 0, "wsc": 0, "arc_challenge": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-high_school_european_history": 0, "crows_pairs_french_age": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-philosophy": 0, "logiqa": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-virology": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_french_religion": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-international_law": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-high_school_chemistry": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-global_facts": 0, "crows_pairs_french_autre": 0, "crows_pairs_english_disability": 0, "sciq": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-marketing": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-high_school_world_history": 0, "crows_pairs_french": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-astronomy": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-management": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-college_physics": 0, "crows_pairs_french_gender": 0, "crows_pairs_english_gender": 0, "hendrycksTest-sociology": 0, "hendrycksTest-nutrition": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step43000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:4", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }