{ "results": { "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.950657894736842, "likelihood_difference_stderr": 0.2574433600890588, "pct_stereotype": 0.631578947368421, "pct_stereotype_stderr": 0.03508771929824559 }, "hendrycksTest-global_facts": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "hendrycksTest-formal_logic": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.040735243221471255 }, "hendrycksTest-moral_disputes": { "acc": 0.25722543352601157, "acc_stderr": 0.02353292543104428, "acc_norm": 0.3063583815028902, "acc_norm_stderr": 0.024818350129436593 }, "hendrycksTest-college_biology": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.03716177437566017 }, "hendrycksTest-management": { "acc": 0.1941747572815534, "acc_stderr": 0.03916667762822583, "acc_norm": 0.23300970873786409, "acc_norm_stderr": 0.04185832598928315 }, "hendrycksTest-philosophy": { "acc": 0.2540192926045016, "acc_stderr": 0.024723861504771693, "acc_norm": 0.2733118971061093, "acc_norm_stderr": 0.02531176597542612 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-world_religions": { "acc": 0.22807017543859648, "acc_stderr": 0.032180937956023566, "acc_norm": 0.2807017543859649, "acc_norm_stderr": 0.034462962170884265 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 5.241263440860215, "likelihood_difference_stderr": 0.5402840881668942, "pct_stereotype": 0.8279569892473119, "pct_stereotype_stderr": 0.039348528120618634 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.804258241758242, "likelihood_difference_stderr": 0.47873767306009934, "pct_stereotype": 0.7912087912087912, "pct_stereotype_stderr": 0.04284305206509431 }, "hendrycksTest-college_chemistry": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "crows_pairs_english_autre": { "likelihood_difference": 5.4914772727272725, "likelihood_difference_stderr": 1.6384338230645699, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "hendrycksTest-professional_accounting": { "acc": 0.2907801418439716, "acc_stderr": 0.027090664368353178, "acc_norm": 0.29432624113475175, "acc_norm_stderr": 0.027187127011503796 }, "hendrycksTest-electrical_engineering": { "acc": 0.2896551724137931, "acc_stderr": 0.03780019230438014, "acc_norm": 0.2896551724137931, "acc_norm_stderr": 0.03780019230438014 }, "hendrycksTest-nutrition": { "acc": 0.28104575163398693, "acc_stderr": 0.02573885479781873, "acc_norm": 0.3431372549019608, "acc_norm_stderr": 0.027184498909941616 }, "hendrycksTest-miscellaneous": { "acc": 0.26053639846743293, "acc_stderr": 0.015696008563807082, "acc_norm": 0.2503192848020434, "acc_norm_stderr": 0.015491088951494566 }, "crows_pairs_french_disability": { "likelihood_difference": 6.596590909090909, "likelihood_difference_stderr": 0.7276689352721015, "pct_stereotype": 0.48484848484848486, "pct_stereotype_stderr": 0.06198888629778894 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.584584153543307, "likelihood_difference_stderr": 0.17124663558058703, "pct_stereotype": 0.452755905511811, "pct_stereotype_stderr": 0.022106430541228055 }, "hendrycksTest-human_aging": { "acc": 0.2556053811659193, "acc_stderr": 0.029275891003969923, "acc_norm": 0.21076233183856502, "acc_norm_stderr": 0.027373095500540193 }, "hendrycksTest-business_ethics": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "hendrycksTest-moral_scenarios": { "acc": 0.2324022346368715, "acc_stderr": 0.014125968754673387, "acc_norm": 0.2435754189944134, "acc_norm_stderr": 0.014355911964767857 }, "hendrycksTest-professional_psychology": { "acc": 0.2565359477124183, "acc_stderr": 0.017667841612378977, "acc_norm": 0.28104575163398693, "acc_norm_stderr": 0.018185218954318082 }, "hendrycksTest-marketing": { "acc": 0.29914529914529914, "acc_stderr": 0.02999695185834948, "acc_norm": 0.2948717948717949, "acc_norm_stderr": 0.029872577708891165 }, "hendrycksTest-sociology": { "acc": 0.23880597014925373, "acc_stderr": 0.03014777593540922, "acc_norm": 0.27860696517412936, "acc_norm_stderr": 0.031700561834973086 }, "logiqa": { "acc": 0.21044546850998463, "acc_stderr": 0.015988369488888737, "acc_norm": 0.2565284178187404, "acc_norm_stderr": 0.01712944332788756 }, "hendrycksTest-high_school_biology": { "acc": 0.2709677419354839, "acc_stderr": 0.02528441611490016, "acc_norm": 0.2967741935483871, "acc_norm_stderr": 0.025988500792411898 }, "hendrycksTest-high_school_computer_science": { "acc": 0.24, "acc_stderr": 0.04292346959909284, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "hendrycksTest-college_mathematics": { "acc": 0.16, "acc_stderr": 0.0368452949177471, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816506 }, "crows_pairs_english_gender": { "likelihood_difference": 3.396826171875, "likelihood_difference_stderr": 0.2639919252147075, "pct_stereotype": 0.484375, "pct_stereotype_stderr": 0.027980952958187033 }, "crows_pairs_french_religion": { "likelihood_difference": 5.348641304347826, "likelihood_difference_stderr": 0.542018501052962, "pct_stereotype": 0.48695652173913045, "pct_stereotype_stderr": 0.04681335351503156 }, "hendrycksTest-high_school_chemistry": { "acc": 0.16748768472906403, "acc_stderr": 0.026273086047535397, "acc_norm": 0.2561576354679803, "acc_norm_stderr": 0.030712730070982592 }, "crows_pairs_english": { "likelihood_difference": 3.730359272510435, "likelihood_difference_stderr": 0.10184234185547236, "pct_stereotype": 0.5193798449612403, "pct_stereotype_stderr": 0.012204121667933781 }, "crows_pairs_french": { "likelihood_difference": 5.56149373881932, "likelihood_difference_stderr": 0.1424654102528186, "pct_stereotype": 0.42695289206917114, "pct_stereotype_stderr": 0.012082258834091222 }, "hendrycksTest-high_school_world_history": { "acc": 0.2869198312236287, "acc_stderr": 0.029443773022594693, "acc_norm": 0.31223628691983124, "acc_norm_stderr": 0.030165137867847004 }, "hendrycksTest-abstract_algebra": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "hendrycksTest-human_sexuality": { "acc": 0.26717557251908397, "acc_stderr": 0.03880848301082394, "acc_norm": 0.25190839694656486, "acc_norm_stderr": 0.03807387116306086 }, "hendrycksTest-anatomy": { "acc": 0.2518518518518518, "acc_stderr": 0.03749850709174021, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.955078125, "likelihood_difference_stderr": 0.3980155873439069, "pct_stereotype": 0.5833333333333334, "pct_stereotype_stderr": 0.058509124791617455 }, "lambada_openai": { "ppl": 142.42891015470678, "ppl_stderr": 6.043810708201551, "acc": 0.18455268775470599, "acc_stderr": 0.00540468283118203 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.5283564814814814, "likelihood_difference_stderr": 0.26698531590286084, "pct_stereotype": 0.4074074074074074, "pct_stereotype_stderr": 0.03350991604696043 }, "hendrycksTest-high_school_us_history": { "acc": 0.2549019607843137, "acc_stderr": 0.030587591351604246, "acc_norm": 0.27941176470588236, "acc_norm_stderr": 0.03149328104507956 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.2605042016806723, "acc_stderr": 0.028510251512341933, "acc_norm": 0.35714285714285715, "acc_norm_stderr": 0.031124619309328177 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.814945652173913, "likelihood_difference_stderr": 0.26747469667074003, "pct_stereotype": 0.3391304347826087, "pct_stereotype_stderr": 0.02209708145176117 }, "hendrycksTest-college_physics": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617747, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.04336432707993177 }, "crows_pairs_french_gender": { "likelihood_difference": 4.641744548286605, "likelihood_difference_stderr": 0.2415529745762916, "pct_stereotype": 0.5015576323987538, "pct_stereotype_stderr": 0.02795071408867036 }, "hendrycksTest-college_computer_science": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816506 }, "hendrycksTest-jurisprudence": { "acc": 0.24074074074074073, "acc_stderr": 0.0413311944024384, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.04668408033024931 }, "hendrycksTest-international_law": { "acc": 0.2066115702479339, "acc_stderr": 0.036959801280988226, "acc_norm": 0.4380165289256198, "acc_norm_stderr": 0.045291468044357915 }, "crows_pairs_english_religion": { "likelihood_difference": 3.739864864864865, "likelihood_difference_stderr": 0.4263788133283603, "pct_stereotype": 0.6396396396396397, "pct_stereotype_stderr": 0.04577621167070315 }, "crows_pairs_french_age": { "likelihood_difference": 4.602430555555555, "likelihood_difference_stderr": 0.5176263692232941, "pct_stereotype": 0.4, "pct_stereotype_stderr": 0.051929078688949845 }, "hendrycksTest-security_studies": { "acc": 0.363265306122449, "acc_stderr": 0.030789051139030802, "acc_norm": 0.31020408163265306, "acc_norm_stderr": 0.029613459872484375 }, "hendrycksTest-machine_learning": { "acc": 0.25, "acc_stderr": 0.04109974682633932, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116 }, "crows_pairs_french_autre": { "likelihood_difference": 3.8822115384615383, "likelihood_difference_stderr": 1.0329310420047324, "pct_stereotype": 0.38461538461538464, "pct_stereotype_stderr": 0.1404416814115811 }, "sciq": { "acc": 0.601, "acc_stderr": 0.015493193313162906, "acc_norm": 0.552, "acc_norm_stderr": 0.015733516566347833 }, "hendrycksTest-prehistory": { "acc": 0.2962962962962963, "acc_stderr": 0.02540719779889016, "acc_norm": 0.25, "acc_norm_stderr": 0.02409347123262133 }, "hendrycksTest-econometrics": { "acc": 0.2894736842105263, "acc_stderr": 0.04266339443159394, "acc_norm": 0.21929824561403508, "acc_norm_stderr": 0.03892431106518754 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 5.561702806122449, "likelihood_difference_stderr": 0.4452498998491719, "pct_stereotype": 0.45408163265306123, "pct_stereotype_stderr": 0.035654431417332814 }, "hendrycksTest-high_school_european_history": { "acc": 0.24242424242424243, "acc_stderr": 0.03346409881055953, "acc_norm": 0.3151515151515151, "acc_norm_stderr": 0.0362773057502241 }, "hendrycksTest-astronomy": { "acc": 0.26973684210526316, "acc_stderr": 0.036117805602848975, "acc_norm": 0.375, "acc_norm_stderr": 0.039397364351956274 }, "hendrycksTest-logical_fallacies": { "acc": 0.22699386503067484, "acc_stderr": 0.03291099578615769, "acc_norm": 0.25153374233128833, "acc_norm_stderr": 0.034089978868575295 }, "hendrycksTest-high_school_physics": { "acc": 0.2251655629139073, "acc_stderr": 0.03410435282008937, "acc_norm": 0.2582781456953642, "acc_norm_stderr": 0.035737053147634576 }, "crows_pairs_french_nationality": { "likelihood_difference": 8.138463438735178, "likelihood_difference_stderr": 0.4545360074730252, "pct_stereotype": 0.2845849802371542, "pct_stereotype_stderr": 0.02842397052208522 }, "hendrycksTest-virology": { "acc": 0.2710843373493976, "acc_stderr": 0.03460579907553027, "acc_norm": 0.24096385542168675, "acc_norm_stderr": 0.033293941190735296 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.25906735751295334, "acc_stderr": 0.031618779179354094, "acc_norm": 0.27461139896373055, "acc_norm_stderr": 0.03221024508041154 }, "crows_pairs_english_disability": { "likelihood_difference": 5.389182692307692, "likelihood_difference_stderr": 0.6517633770952715, "pct_stereotype": 0.5692307692307692, "pct_stereotype_stderr": 0.061897988228581086 }, "hendrycksTest-high_school_mathematics": { "acc": 0.18888888888888888, "acc_stderr": 0.023865318862285302, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.02564410863926762 }, "hendrycksTest-high_school_psychology": { "acc": 0.26238532110091745, "acc_stderr": 0.018861885021534738, "acc_norm": 0.22935779816513763, "acc_norm_stderr": 0.018025349724618688 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.962239583333333, "likelihood_difference_stderr": 0.694376691160675, "pct_stereotype": 0.5138888888888888, "pct_stereotype_stderr": 0.05931618532716554 }, "hendrycksTest-college_medicine": { "acc": 0.2658959537572254, "acc_stderr": 0.0336876293225943, "acc_norm": 0.2832369942196532, "acc_norm_stderr": 0.03435568056047873 }, "hendrycksTest-professional_law": { "acc": 0.2457627118644068, "acc_stderr": 0.01099615663514269, "acc_norm": 0.2790091264667536, "acc_norm_stderr": 0.011455208832803546 }, "hendrycksTest-us_foreign_policy": { "acc": 0.26, "acc_stderr": 0.044084400227680794, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-elementary_mathematics": { "acc": 0.25132275132275134, "acc_stderr": 0.022340482339643895, "acc_norm": 0.23544973544973544, "acc_norm_stderr": 0.02185150982203172 }, "arc_easy": { "acc": 0.37373737373737376, "acc_stderr": 0.009927267058259621, "acc_norm": 0.3501683501683502, "acc_norm_stderr": 0.009788295410093142 }, "arc_challenge": { "acc": 0.18088737201365188, "acc_stderr": 0.01124857446740701, "acc_norm": 0.22098976109215018, "acc_norm_stderr": 0.012124929206818258 }, "crows_pairs_english_age": { "likelihood_difference": 2.605254120879121, "likelihood_difference_stderr": 0.2698048849736164, "pct_stereotype": 0.5054945054945055, "pct_stereotype_stderr": 0.05270144531112881 }, "hendrycksTest-conceptual_physics": { "acc": 0.2297872340425532, "acc_stderr": 0.02750175294441242, "acc_norm": 0.1829787234042553, "acc_norm_stderr": 0.025276041000449966 }, "hendrycksTest-medical_genetics": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.21794871794871795, "acc_stderr": 0.020932445774463175, "acc_norm": 0.2794871794871795, "acc_norm_stderr": 0.022752388839776826 }, "winogrande": { "acc": 0.5280189423835833, "acc_stderr": 0.014030404213405777 }, "hendrycksTest-public_relations": { "acc": 0.3, "acc_stderr": 0.04389311454644286, "acc_norm": 0.19090909090909092, "acc_norm_stderr": 0.03764425585984924 }, "hendrycksTest-high_school_geography": { "acc": 0.2676767676767677, "acc_stderr": 0.031544498882702866, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547 }, "hendrycksTest-clinical_knowledge": { "acc": 0.2339622641509434, "acc_stderr": 0.02605529690115292, "acc_norm": 0.2943396226415094, "acc_norm_stderr": 0.028049186315695245 }, "hendrycksTest-professional_medicine": { "acc": 0.2867647058823529, "acc_stderr": 0.027472274473233818, "acc_norm": 0.25735294117647056, "acc_norm_stderr": 0.026556519470041524 }, "piqa": { "acc": 0.5946681175190425, "acc_stderr": 0.011454816387346764, "acc_norm": 0.5914036996735582, "acc_norm_stderr": 0.01146924038724515 }, "hendrycksTest-computer_security": { "acc": 0.2, "acc_stderr": 0.04020151261036845, "acc_norm": 0.27, "acc_norm_stderr": 0.0446196043338474 }, "hendrycksTest-high_school_statistics": { "acc": 0.2037037037037037, "acc_stderr": 0.027467401804057996, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.02896370257079103 } }, "versions": { "crows_pairs_english_socioeconomic": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-management": 0, "hendrycksTest-philosophy": 0, "wsc": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-college_chemistry": 0, "crows_pairs_english_autre": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-miscellaneous": 0, "crows_pairs_french_disability": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-marketing": 0, "hendrycksTest-sociology": 0, "logiqa": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-college_mathematics": 0, "crows_pairs_english_gender": 0, "crows_pairs_french_religion": 0, "hendrycksTest-high_school_chemistry": 0, "crows_pairs_english": 0, "crows_pairs_french": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-anatomy": 0, "crows_pairs_english_physical_appearance": 0, "lambada_openai": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_microeconomics": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-college_physics": 0, "crows_pairs_french_gender": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-international_law": 0, "crows_pairs_english_religion": 0, "crows_pairs_french_age": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-machine_learning": 0, "crows_pairs_french_autre": 0, "sciq": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-econometrics": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-virology": 0, "hendrycksTest-high_school_government_and_politics": 0, "crows_pairs_english_disability": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_psychology": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-elementary_mathematics": 0, "arc_easy": 0, "arc_challenge": 0, "crows_pairs_english_age": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-high_school_macroeconomics": 0, "winogrande": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-professional_medicine": 0, "piqa": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-high_school_statistics": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step143000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:7", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }