{ "results": { "hendrycksTest-sociology": { "acc": 0.2736318407960199, "acc_stderr": 0.03152439186555404, "acc_norm": 0.3034825870646766, "acc_norm_stderr": 0.03251006816458617 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.19747899159663865, "acc_stderr": 0.02585916412205146, "acc_norm": 0.3025210084033613, "acc_norm_stderr": 0.02983796238829193 }, "hendrycksTest-college_computer_science": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036623 }, "hendrycksTest-conceptual_physics": { "acc": 0.2680851063829787, "acc_stderr": 0.02895734278834235, "acc_norm": 0.18723404255319148, "acc_norm_stderr": 0.02550158834188358 }, "hendrycksTest-high_school_statistics": { "acc": 0.2037037037037037, "acc_stderr": 0.027467401804057986, "acc_norm": 0.22685185185185186, "acc_norm_stderr": 0.02856165010242227 }, "hendrycksTest-clinical_knowledge": { "acc": 0.23018867924528302, "acc_stderr": 0.025907897122408173, "acc_norm": 0.32452830188679244, "acc_norm_stderr": 0.028815615713432118 }, "piqa": { "acc": 0.5919477693144722, "acc_stderr": 0.011466872778651261, "acc_norm": 0.5979325353645266, "acc_norm_stderr": 0.01143986712726753 }, "crows_pairs_french_disability": { "likelihood_difference": 5.745738636363637, "likelihood_difference_stderr": 0.603740965474876, "pct_stereotype": 0.48484848484848486, "pct_stereotype_stderr": 0.06198888629778894 }, "hendrycksTest-college_medicine": { "acc": 0.2138728323699422, "acc_stderr": 0.03126511206173042, "acc_norm": 0.3063583815028902, "acc_norm_stderr": 0.03514942551267437 }, "crows_pairs_english_disability": { "likelihood_difference": 4.940384615384615, "likelihood_difference_stderr": 0.5258513529267634, "pct_stereotype": 0.6153846153846154, "pct_stereotype_stderr": 0.06081303192631497 }, "hendrycksTest-econometrics": { "acc": 0.20175438596491227, "acc_stderr": 0.037752050135836386, "acc_norm": 0.19298245614035087, "acc_norm_stderr": 0.037124548537213684 }, "hendrycksTest-business_ethics": { "acc": 0.35, "acc_stderr": 0.04793724854411018, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.29533678756476683, "acc_stderr": 0.032922966391551414, "acc_norm": 0.27461139896373055, "acc_norm_stderr": 0.03221024508041154 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 4.6717032967032965, "likelihood_difference_stderr": 0.35079580322071463, "pct_stereotype": 0.8021978021978022, "pct_stereotype_stderr": 0.04198895203196222 }, "hendrycksTest-abstract_algebra": { "acc": 0.2, "acc_stderr": 0.040201512610368466, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.72429049744898, "likelihood_difference_stderr": 0.38514448828446046, "pct_stereotype": 0.45408163265306123, "pct_stereotype_stderr": 0.035654431417332814 }, "crows_pairs_english": { "likelihood_difference": 3.67170728980322, "likelihood_difference_stderr": 0.1032630912208814, "pct_stereotype": 0.545020870602266, "pct_stereotype_stderr": 0.012163688705232118 }, "crows_pairs_french": { "likelihood_difference": 5.014772473166368, "likelihood_difference_stderr": 0.12242859643295022, "pct_stereotype": 0.43410852713178294, "pct_stereotype_stderr": 0.012106782103996008 }, "hendrycksTest-college_mathematics": { "acc": 0.2, "acc_stderr": 0.04020151261036845, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078 }, "hendrycksTest-miscellaneous": { "acc": 0.2681992337164751, "acc_stderr": 0.015842430835269435, "acc_norm": 0.2515964240102171, "acc_norm_stderr": 0.015517322365529619 }, "hendrycksTest-moral_disputes": { "acc": 0.2630057803468208, "acc_stderr": 0.023703099525258155, "acc_norm": 0.2947976878612717, "acc_norm_stderr": 0.02454761779480383 }, "hendrycksTest-college_chemistry": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.27, "acc_norm_stderr": 0.04461960433384741 }, "hendrycksTest-moral_scenarios": { "acc": 0.25027932960893856, "acc_stderr": 0.014487500852850412, "acc_norm": 0.24692737430167597, "acc_norm_stderr": 0.014422292204808835 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2074074074074074, "acc_stderr": 0.024720713193952148, "acc_norm": 0.2518518518518518, "acc_norm_stderr": 0.026466117538959902 }, "hendrycksTest-high_school_us_history": { "acc": 0.2647058823529412, "acc_stderr": 0.03096451792692341, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.03096451792692341 }, "crows_pairs_french_gender": { "likelihood_difference": 4.173773364485982, "likelihood_difference_stderr": 0.20666001663696318, "pct_stereotype": 0.5327102803738317, "pct_stereotype_stderr": 0.027890972865217984 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.8569878472222223, "likelihood_difference_stderr": 0.44844825841380226, "pct_stereotype": 0.5277777777777778, "pct_stereotype_stderr": 0.05924743948371487 }, "hendrycksTest-high_school_physics": { "acc": 0.18543046357615894, "acc_stderr": 0.03173284384294287, "acc_norm": 0.2185430463576159, "acc_norm_stderr": 0.03374235550425694 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-jurisprudence": { "acc": 0.18518518518518517, "acc_stderr": 0.03755265865037181, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.04668408033024931 }, "arc_easy": { "acc": 0.39225589225589225, "acc_stderr": 0.010018744689650043, "acc_norm": 0.35858585858585856, "acc_norm_stderr": 0.009840882301225297 }, "hendrycksTest-formal_logic": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.040735243221471255 }, "hendrycksTest-high_school_psychology": { "acc": 0.27155963302752295, "acc_stderr": 0.019069098363191442, "acc_norm": 0.26605504587155965, "acc_norm_stderr": 0.018946022322225614 }, "crows_pairs_english_autre": { "likelihood_difference": 5.355113636363637, "likelihood_difference_stderr": 1.5602556194869146, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.1574591643244434 }, "hendrycksTest-high_school_european_history": { "acc": 0.2606060606060606, "acc_stderr": 0.034277431758165236, "acc_norm": 0.2787878787878788, "acc_norm_stderr": 0.035014387062967806 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.9657894736842105, "likelihood_difference_stderr": 0.2608872260073087, "pct_stereotype": 0.6473684210526316, "pct_stereotype_stderr": 0.034754052595820976 }, "hendrycksTest-electrical_engineering": { "acc": 0.296551724137931, "acc_stderr": 0.03806142687309994, "acc_norm": 0.32413793103448274, "acc_norm_stderr": 0.03900432069185554 }, "hendrycksTest-anatomy": { "acc": 0.25925925925925924, "acc_stderr": 0.03785714465066654, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.307291666666667, "likelihood_difference_stderr": 0.5547099715245821, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.05933908290969268 }, "hendrycksTest-philosophy": { "acc": 0.2282958199356913, "acc_stderr": 0.023839303311398215, "acc_norm": 0.3022508038585209, "acc_norm_stderr": 0.02608270069539966 }, "lambada_openai": { "ppl": 94.31955728859376, "ppl_stderr": 3.991574316908998, "acc": 0.25344459538133124, "acc_stderr": 0.0060601672763364745 }, "hendrycksTest-high_school_computer_science": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421296 }, "hendrycksTest-nutrition": { "acc": 0.25163398692810457, "acc_stderr": 0.024848018263875192, "acc_norm": 0.34967320261437906, "acc_norm_stderr": 0.027305308076274702 }, "hendrycksTest-virology": { "acc": 0.27710843373493976, "acc_stderr": 0.034843315926805875, "acc_norm": 0.2891566265060241, "acc_norm_stderr": 0.03529486801511115 }, "crows_pairs_english_gender": { "likelihood_difference": 3.0603515625, "likelihood_difference_stderr": 0.2570312907090984, "pct_stereotype": 0.5125, "pct_stereotype_stderr": 0.02798587585995665 }, "hendrycksTest-computer_security": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720683 }, "hendrycksTest-professional_accounting": { "acc": 0.2730496453900709, "acc_stderr": 0.02657786094330786, "acc_norm": 0.25886524822695034, "acc_norm_stderr": 0.02612957252718085 }, "hendrycksTest-machine_learning": { "acc": 0.3482142857142857, "acc_stderr": 0.045218299028335865, "acc_norm": 0.2767857142857143, "acc_norm_stderr": 0.042466243366976256 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.5856606791338583, "likelihood_difference_stderr": 0.18118219123514714, "pct_stereotype": 0.5118110236220472, "pct_stereotype_stderr": 0.022199583294816923 }, "crows_pairs_english_religion": { "likelihood_difference": 3.8061655405405403, "likelihood_difference_stderr": 0.43453880510820464, "pct_stereotype": 0.6036036036036037, "pct_stereotype_stderr": 0.04663848326322447 }, "hendrycksTest-management": { "acc": 0.22330097087378642, "acc_stderr": 0.04123553189891431, "acc_norm": 0.3106796116504854, "acc_norm_stderr": 0.04582124160161551 }, "sciq": { "acc": 0.664, "acc_stderr": 0.014944140233795028, "acc_norm": 0.572, "acc_norm_stderr": 0.01565442624502929 }, "hendrycksTest-astronomy": { "acc": 0.17763157894736842, "acc_stderr": 0.031103182383123387, "acc_norm": 0.34868421052631576, "acc_norm_stderr": 0.03878139888797609 }, "hendrycksTest-high_school_world_history": { "acc": 0.23628691983122363, "acc_stderr": 0.027652153144159294, "acc_norm": 0.3080168776371308, "acc_norm_stderr": 0.030052389335605695 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.440149456521739, "likelihood_difference_stderr": 0.2261395575520835, "pct_stereotype": 0.3239130434782609, "pct_stereotype_stderr": 0.021842842500532617 }, "hendrycksTest-global_facts": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "hendrycksTest-human_sexuality": { "acc": 0.3053435114503817, "acc_stderr": 0.040393149787245605, "acc_norm": 0.2824427480916031, "acc_norm_stderr": 0.03948406125768361 }, "hendrycksTest-prehistory": { "acc": 0.2993827160493827, "acc_stderr": 0.02548311560119546, "acc_norm": 0.23148148148148148, "acc_norm_stderr": 0.023468429832451145 }, "hendrycksTest-college_biology": { "acc": 0.25, "acc_stderr": 0.03621034121889507, "acc_norm": 0.25, "acc_norm_stderr": 0.03621034121889507 }, "crows_pairs_french_age": { "likelihood_difference": 4.878472222222222, "likelihood_difference_stderr": 0.4858540541132919, "pct_stereotype": 0.4666666666666667, "pct_stereotype_stderr": 0.05288198530254015 }, "hendrycksTest-marketing": { "acc": 0.2948717948717949, "acc_stderr": 0.029872577708891162, "acc_norm": 0.3162393162393162, "acc_norm_stderr": 0.030463656747340247 }, "hendrycksTest-security_studies": { "acc": 0.3183673469387755, "acc_stderr": 0.029822533793982052, "acc_norm": 0.23265306122448978, "acc_norm_stderr": 0.02704925791589618 }, "hendrycksTest-international_law": { "acc": 0.2066115702479339, "acc_stderr": 0.03695980128098823, "acc_norm": 0.4132231404958678, "acc_norm_stderr": 0.04495087843548408 }, "hendrycksTest-elementary_mathematics": { "acc": 0.22486772486772486, "acc_stderr": 0.021502096078229147, "acc_norm": 0.20634920634920634, "acc_norm_stderr": 0.020842290930114676 }, "hendrycksTest-high_school_geography": { "acc": 0.2474747474747475, "acc_stderr": 0.030746300742124522, "acc_norm": 0.32323232323232326, "acc_norm_stderr": 0.033322999210706444 }, "crows_pairs_french_religion": { "likelihood_difference": 4.854619565217392, "likelihood_difference_stderr": 0.505869033934835, "pct_stereotype": 0.4956521739130435, "pct_stereotype_stderr": 0.04682752006203916 }, "hendrycksTest-world_religions": { "acc": 0.2631578947368421, "acc_stderr": 0.033773102522091945, "acc_norm": 0.30994152046783624, "acc_norm_stderr": 0.035469769593931624 }, "hendrycksTest-logical_fallacies": { "acc": 0.22699386503067484, "acc_stderr": 0.032910995786157686, "acc_norm": 0.2883435582822086, "acc_norm_stderr": 0.035590395316173425 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.36919466403162, "likelihood_difference_stderr": 0.3929905019461457, "pct_stereotype": 0.2964426877470356, "pct_stereotype_stderr": 0.028768673758013903 }, "crows_pairs_french_autre": { "likelihood_difference": 4.103365384615385, "likelihood_difference_stderr": 1.0499970465523882, "pct_stereotype": 0.3076923076923077, "pct_stereotype_stderr": 0.13323467750529824 }, "hendrycksTest-high_school_biology": { "acc": 0.24193548387096775, "acc_stderr": 0.024362599693031086, "acc_norm": 0.3, "acc_norm_stderr": 0.02606936229533513 }, "hendrycksTest-medical_genetics": { "acc": 0.23, "acc_stderr": 0.04229525846816507, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "hendrycksTest-us_foreign_policy": { "acc": 0.27, "acc_stderr": 0.04461960433384739, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-professional_law": { "acc": 0.24837027379400262, "acc_stderr": 0.01103521259803449, "acc_norm": 0.27444589308996087, "acc_norm_stderr": 0.011397043163078154 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.869623655913978, "likelihood_difference_stderr": 0.5959735406192751, "pct_stereotype": 0.7849462365591398, "pct_stereotype_stderr": 0.04283507835554754 }, "hendrycksTest-professional_psychology": { "acc": 0.2565359477124183, "acc_stderr": 0.017667841612378984, "acc_norm": 0.25163398692810457, "acc_norm_stderr": 0.017555818091322256 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.8365162037037037, "likelihood_difference_stderr": 0.2671010238288838, "pct_stereotype": 0.4444444444444444, "pct_stereotype_stderr": 0.03388857118502326 }, "hendrycksTest-professional_medicine": { "acc": 0.3014705882352941, "acc_stderr": 0.027875982114273168, "acc_norm": 0.26838235294117646, "acc_norm_stderr": 0.02691748122437721 }, "winogrande": { "acc": 0.494869771112865, "acc_stderr": 0.014051745961790516 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.23333333333333334, "acc_stderr": 0.02144454730156047, "acc_norm": 0.2717948717948718, "acc_norm_stderr": 0.02255655101013236 }, "hendrycksTest-human_aging": { "acc": 0.3004484304932735, "acc_stderr": 0.030769352008229136, "acc_norm": 0.242152466367713, "acc_norm_stderr": 0.028751392398694755 }, "hendrycksTest-college_physics": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179962, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "logiqa": { "acc": 0.2227342549923195, "acc_stderr": 0.01632005404616512, "acc_norm": 0.27956989247311825, "acc_norm_stderr": 0.017602909186822453 }, "hendrycksTest-high_school_chemistry": { "acc": 0.19704433497536947, "acc_stderr": 0.02798672466673622, "acc_norm": 0.23645320197044334, "acc_norm_stderr": 0.02989611429173355 }, "hendrycksTest-public_relations": { "acc": 0.2909090909090909, "acc_stderr": 0.04350271442923243, "acc_norm": 0.2, "acc_norm_stderr": 0.038313051408846034 }, "arc_challenge": { "acc": 0.1757679180887372, "acc_stderr": 0.011122850863120485, "acc_norm": 0.21331058020477817, "acc_norm_stderr": 0.011970971742326334 }, "crows_pairs_english_age": { "likelihood_difference": 2.652129120879121, "likelihood_difference_stderr": 0.2944534289937784, "pct_stereotype": 0.5164835164835165, "pct_stereotype_stderr": 0.05267597952306975 } }, "versions": { "hendrycksTest-sociology": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-clinical_knowledge": 0, "piqa": 0, "crows_pairs_french_disability": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_english_disability": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-high_school_government_and_politics": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-abstract_algebra": 0, "crows_pairs_french_socioeconomic": 0, "crows_pairs_english": 0, "crows_pairs_french": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_french_gender": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-high_school_physics": 0, "wsc": 0, "hendrycksTest-jurisprudence": 0, "arc_easy": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-high_school_psychology": 0, "crows_pairs_english_autre": 0, "hendrycksTest-high_school_european_history": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-anatomy": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-philosophy": 0, "lambada_openai": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-virology": 0, "crows_pairs_english_gender": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-machine_learning": 0, "crows_pairs_english_race_color": 0, "crows_pairs_english_religion": 0, "hendrycksTest-management": 0, "sciq": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-high_school_world_history": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-college_biology": 0, "crows_pairs_french_age": 0, "hendrycksTest-marketing": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-international_law": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-high_school_geography": 0, "crows_pairs_french_religion": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-logical_fallacies": 0, "crows_pairs_french_nationality": 0, "crows_pairs_french_autre": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-professional_law": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-professional_psychology": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-professional_medicine": 0, "winogrande": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-college_physics": 0, "logiqa": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-public_relations": 0, "arc_challenge": 0, "crows_pairs_english_age": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step53000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:5", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }