karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
Raw
History Blame
21.9 kB
{
"results": {
"hendrycksTest-sociology": {
"acc": 0.2736318407960199,
"acc_stderr": 0.03152439186555404,
"acc_norm": 0.3034825870646766,
"acc_norm_stderr": 0.03251006816458617
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.19747899159663865,
"acc_stderr": 0.02585916412205146,
"acc_norm": 0.3025210084033613,
"acc_norm_stderr": 0.02983796238829193
},
"hendrycksTest-college_computer_science": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036623
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2680851063829787,
"acc_stderr": 0.02895734278834235,
"acc_norm": 0.18723404255319148,
"acc_norm_stderr": 0.02550158834188358
},
"hendrycksTest-high_school_statistics": {
"acc": 0.2037037037037037,
"acc_stderr": 0.027467401804057986,
"acc_norm": 0.22685185185185186,
"acc_norm_stderr": 0.02856165010242227
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.23018867924528302,
"acc_stderr": 0.025907897122408173,
"acc_norm": 0.32452830188679244,
"acc_norm_stderr": 0.028815615713432118
},
"piqa": {
"acc": 0.5919477693144722,
"acc_stderr": 0.011466872778651261,
"acc_norm": 0.5979325353645266,
"acc_norm_stderr": 0.01143986712726753
},
"crows_pairs_french_disability": {
"likelihood_difference": 5.745738636363637,
"likelihood_difference_stderr": 0.603740965474876,
"pct_stereotype": 0.48484848484848486,
"pct_stereotype_stderr": 0.06198888629778894
},
"hendrycksTest-college_medicine": {
"acc": 0.2138728323699422,
"acc_stderr": 0.03126511206173042,
"acc_norm": 0.3063583815028902,
"acc_norm_stderr": 0.03514942551267437
},
"crows_pairs_english_disability": {
"likelihood_difference": 4.940384615384615,
"likelihood_difference_stderr": 0.5258513529267634,
"pct_stereotype": 0.6153846153846154,
"pct_stereotype_stderr": 0.06081303192631497
},
"hendrycksTest-econometrics": {
"acc": 0.20175438596491227,
"acc_stderr": 0.037752050135836386,
"acc_norm": 0.19298245614035087,
"acc_norm_stderr": 0.037124548537213684
},
"hendrycksTest-business_ethics": {
"acc": 0.35,
"acc_stderr": 0.04793724854411018,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542128
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.29533678756476683,
"acc_stderr": 0.032922966391551414,
"acc_norm": 0.27461139896373055,
"acc_norm_stderr": 0.03221024508041154
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 4.6717032967032965,
"likelihood_difference_stderr": 0.35079580322071463,
"pct_stereotype": 0.8021978021978022,
"pct_stereotype_stderr": 0.04198895203196222
},
"hendrycksTest-abstract_algebra": {
"acc": 0.2,
"acc_stderr": 0.040201512610368466,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 4.72429049744898,
"likelihood_difference_stderr": 0.38514448828446046,
"pct_stereotype": 0.45408163265306123,
"pct_stereotype_stderr": 0.035654431417332814
},
"crows_pairs_english": {
"likelihood_difference": 3.67170728980322,
"likelihood_difference_stderr": 0.1032630912208814,
"pct_stereotype": 0.545020870602266,
"pct_stereotype_stderr": 0.012163688705232118
},
"crows_pairs_french": {
"likelihood_difference": 5.014772473166368,
"likelihood_difference_stderr": 0.12242859643295022,
"pct_stereotype": 0.43410852713178294,
"pct_stereotype_stderr": 0.012106782103996008
},
"hendrycksTest-college_mathematics": {
"acc": 0.2,
"acc_stderr": 0.04020151261036845,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768078
},
"hendrycksTest-miscellaneous": {
"acc": 0.2681992337164751,
"acc_stderr": 0.015842430835269435,
"acc_norm": 0.2515964240102171,
"acc_norm_stderr": 0.015517322365529619
},
"hendrycksTest-moral_disputes": {
"acc": 0.2630057803468208,
"acc_stderr": 0.023703099525258155,
"acc_norm": 0.2947976878612717,
"acc_norm_stderr": 0.02454761779480383
},
"hendrycksTest-college_chemistry": {
"acc": 0.3,
"acc_stderr": 0.046056618647183814,
"acc_norm": 0.27,
"acc_norm_stderr": 0.04461960433384741
},
"hendrycksTest-moral_scenarios": {
"acc": 0.25027932960893856,
"acc_stderr": 0.014487500852850412,
"acc_norm": 0.24692737430167597,
"acc_norm_stderr": 0.014422292204808835
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.2074074074074074,
"acc_stderr": 0.024720713193952148,
"acc_norm": 0.2518518518518518,
"acc_norm_stderr": 0.026466117538959902
},
"hendrycksTest-high_school_us_history": {
"acc": 0.2647058823529412,
"acc_stderr": 0.03096451792692341,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.03096451792692341
},
"crows_pairs_french_gender": {
"likelihood_difference": 4.173773364485982,
"likelihood_difference_stderr": 0.20666001663696318,
"pct_stereotype": 0.5327102803738317,
"pct_stereotype_stderr": 0.027890972865217984
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 3.8569878472222223,
"likelihood_difference_stderr": 0.44844825841380226,
"pct_stereotype": 0.5277777777777778,
"pct_stereotype_stderr": 0.05924743948371487
},
"hendrycksTest-high_school_physics": {
"acc": 0.18543046357615894,
"acc_stderr": 0.03173284384294287,
"acc_norm": 0.2185430463576159,
"acc_norm_stderr": 0.03374235550425694
},
"wsc": {
"acc": 0.36538461538461536,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-jurisprudence": {
"acc": 0.18518518518518517,
"acc_stderr": 0.03755265865037181,
"acc_norm": 0.37037037037037035,
"acc_norm_stderr": 0.04668408033024931
},
"arc_easy": {
"acc": 0.39225589225589225,
"acc_stderr": 0.010018744689650043,
"acc_norm": 0.35858585858585856,
"acc_norm_stderr": 0.009840882301225297
},
"hendrycksTest-formal_logic": {
"acc": 0.30952380952380953,
"acc_stderr": 0.04134913018303316,
"acc_norm": 0.29365079365079366,
"acc_norm_stderr": 0.040735243221471255
},
"hendrycksTest-high_school_psychology": {
"acc": 0.27155963302752295,
"acc_stderr": 0.019069098363191442,
"acc_norm": 0.26605504587155965,
"acc_norm_stderr": 0.018946022322225614
},
"crows_pairs_english_autre": {
"likelihood_difference": 5.355113636363637,
"likelihood_difference_stderr": 1.5602556194869146,
"pct_stereotype": 0.5454545454545454,
"pct_stereotype_stderr": 0.1574591643244434
},
"hendrycksTest-high_school_european_history": {
"acc": 0.2606060606060606,
"acc_stderr": 0.034277431758165236,
"acc_norm": 0.2787878787878788,
"acc_norm_stderr": 0.035014387062967806
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 3.9657894736842105,
"likelihood_difference_stderr": 0.2608872260073087,
"pct_stereotype": 0.6473684210526316,
"pct_stereotype_stderr": 0.034754052595820976
},
"hendrycksTest-electrical_engineering": {
"acc": 0.296551724137931,
"acc_stderr": 0.03806142687309994,
"acc_norm": 0.32413793103448274,
"acc_norm_stderr": 0.03900432069185554
},
"hendrycksTest-anatomy": {
"acc": 0.25925925925925924,
"acc_stderr": 0.03785714465066654,
"acc_norm": 0.23703703703703705,
"acc_norm_stderr": 0.03673731683969506
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 5.307291666666667,
"likelihood_difference_stderr": 0.5547099715245821,
"pct_stereotype": 0.5,
"pct_stereotype_stderr": 0.05933908290969268
},
"hendrycksTest-philosophy": {
"acc": 0.2282958199356913,
"acc_stderr": 0.023839303311398215,
"acc_norm": 0.3022508038585209,
"acc_norm_stderr": 0.02608270069539966
},
"lambada_openai": {
"ppl": 94.31955728859376,
"ppl_stderr": 3.991574316908998,
"acc": 0.25344459538133124,
"acc_stderr": 0.0060601672763364745
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421296
},
"hendrycksTest-nutrition": {
"acc": 0.25163398692810457,
"acc_stderr": 0.024848018263875192,
"acc_norm": 0.34967320261437906,
"acc_norm_stderr": 0.027305308076274702
},
"hendrycksTest-virology": {
"acc": 0.27710843373493976,
"acc_stderr": 0.034843315926805875,
"acc_norm": 0.2891566265060241,
"acc_norm_stderr": 0.03529486801511115
},
"crows_pairs_english_gender": {
"likelihood_difference": 3.0603515625,
"likelihood_difference_stderr": 0.2570312907090984,
"pct_stereotype": 0.5125,
"pct_stereotype_stderr": 0.02798587585995665
},
"hendrycksTest-computer_security": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720683
},
"hendrycksTest-professional_accounting": {
"acc": 0.2730496453900709,
"acc_stderr": 0.02657786094330786,
"acc_norm": 0.25886524822695034,
"acc_norm_stderr": 0.02612957252718085
},
"hendrycksTest-machine_learning": {
"acc": 0.3482142857142857,
"acc_stderr": 0.045218299028335865,
"acc_norm": 0.2767857142857143,
"acc_norm_stderr": 0.042466243366976256
},
"crows_pairs_english_race_color": {
"likelihood_difference": 3.5856606791338583,
"likelihood_difference_stderr": 0.18118219123514714,
"pct_stereotype": 0.5118110236220472,
"pct_stereotype_stderr": 0.022199583294816923
},
"crows_pairs_english_religion": {
"likelihood_difference": 3.8061655405405403,
"likelihood_difference_stderr": 0.43453880510820464,
"pct_stereotype": 0.6036036036036037,
"pct_stereotype_stderr": 0.04663848326322447
},
"hendrycksTest-management": {
"acc": 0.22330097087378642,
"acc_stderr": 0.04123553189891431,
"acc_norm": 0.3106796116504854,
"acc_norm_stderr": 0.04582124160161551
},
"sciq": {
"acc": 0.664,
"acc_stderr": 0.014944140233795028,
"acc_norm": 0.572,
"acc_norm_stderr": 0.01565442624502929
},
"hendrycksTest-astronomy": {
"acc": 0.17763157894736842,
"acc_stderr": 0.031103182383123387,
"acc_norm": 0.34868421052631576,
"acc_norm_stderr": 0.03878139888797609
},
"hendrycksTest-high_school_world_history": {
"acc": 0.23628691983122363,
"acc_stderr": 0.027652153144159294,
"acc_norm": 0.3080168776371308,
"acc_norm_stderr": 0.030052389335605695
},
"crows_pairs_french_race_color": {
"likelihood_difference": 4.440149456521739,
"likelihood_difference_stderr": 0.2261395575520835,
"pct_stereotype": 0.3239130434782609,
"pct_stereotype_stderr": 0.021842842500532617
},
"hendrycksTest-global_facts": {
"acc": 0.22,
"acc_stderr": 0.04163331998932268,
"acc_norm": 0.21,
"acc_norm_stderr": 0.040936018074033256
},
"hendrycksTest-human_sexuality": {
"acc": 0.3053435114503817,
"acc_stderr": 0.040393149787245605,
"acc_norm": 0.2824427480916031,
"acc_norm_stderr": 0.03948406125768361
},
"hendrycksTest-prehistory": {
"acc": 0.2993827160493827,
"acc_stderr": 0.02548311560119546,
"acc_norm": 0.23148148148148148,
"acc_norm_stderr": 0.023468429832451145
},
"hendrycksTest-college_biology": {
"acc": 0.25,
"acc_stderr": 0.03621034121889507,
"acc_norm": 0.25,
"acc_norm_stderr": 0.03621034121889507
},
"crows_pairs_french_age": {
"likelihood_difference": 4.878472222222222,
"likelihood_difference_stderr": 0.4858540541132919,
"pct_stereotype": 0.4666666666666667,
"pct_stereotype_stderr": 0.05288198530254015
},
"hendrycksTest-marketing": {
"acc": 0.2948717948717949,
"acc_stderr": 0.029872577708891162,
"acc_norm": 0.3162393162393162,
"acc_norm_stderr": 0.030463656747340247
},
"hendrycksTest-security_studies": {
"acc": 0.3183673469387755,
"acc_stderr": 0.029822533793982052,
"acc_norm": 0.23265306122448978,
"acc_norm_stderr": 0.02704925791589618
},
"hendrycksTest-international_law": {
"acc": 0.2066115702479339,
"acc_stderr": 0.03695980128098823,
"acc_norm": 0.4132231404958678,
"acc_norm_stderr": 0.04495087843548408
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.22486772486772486,
"acc_stderr": 0.021502096078229147,
"acc_norm": 0.20634920634920634,
"acc_norm_stderr": 0.020842290930114676
},
"hendrycksTest-high_school_geography": {
"acc": 0.2474747474747475,
"acc_stderr": 0.030746300742124522,
"acc_norm": 0.32323232323232326,
"acc_norm_stderr": 0.033322999210706444
},
"crows_pairs_french_religion": {
"likelihood_difference": 4.854619565217392,
"likelihood_difference_stderr": 0.505869033934835,
"pct_stereotype": 0.4956521739130435,
"pct_stereotype_stderr": 0.04682752006203916
},
"hendrycksTest-world_religions": {
"acc": 0.2631578947368421,
"acc_stderr": 0.033773102522091945,
"acc_norm": 0.30994152046783624,
"acc_norm_stderr": 0.035469769593931624
},
"hendrycksTest-logical_fallacies": {
"acc": 0.22699386503067484,
"acc_stderr": 0.032910995786157686,
"acc_norm": 0.2883435582822086,
"acc_norm_stderr": 0.035590395316173425
},
"crows_pairs_french_nationality": {
"likelihood_difference": 7.36919466403162,
"likelihood_difference_stderr": 0.3929905019461457,
"pct_stereotype": 0.2964426877470356,
"pct_stereotype_stderr": 0.028768673758013903
},
"crows_pairs_french_autre": {
"likelihood_difference": 4.103365384615385,
"likelihood_difference_stderr": 1.0499970465523882,
"pct_stereotype": 0.3076923076923077,
"pct_stereotype_stderr": 0.13323467750529824
},
"hendrycksTest-high_school_biology": {
"acc": 0.24193548387096775,
"acc_stderr": 0.024362599693031086,
"acc_norm": 0.3,
"acc_norm_stderr": 0.02606936229533513
},
"hendrycksTest-medical_genetics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816507,
"acc_norm": 0.35,
"acc_norm_stderr": 0.047937248544110196
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.27,
"acc_stderr": 0.04461960433384739,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-professional_law": {
"acc": 0.24837027379400262,
"acc_stderr": 0.01103521259803449,
"acc_norm": 0.27444589308996087,
"acc_norm_stderr": 0.011397043163078154
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 4.869623655913978,
"likelihood_difference_stderr": 0.5959735406192751,
"pct_stereotype": 0.7849462365591398,
"pct_stereotype_stderr": 0.04283507835554754
},
"hendrycksTest-professional_psychology": {
"acc": 0.2565359477124183,
"acc_stderr": 0.017667841612378984,
"acc_norm": 0.25163398692810457,
"acc_norm_stderr": 0.017555818091322256
},
"crows_pairs_english_nationality": {
"likelihood_difference": 3.8365162037037037,
"likelihood_difference_stderr": 0.2671010238288838,
"pct_stereotype": 0.4444444444444444,
"pct_stereotype_stderr": 0.03388857118502326
},
"hendrycksTest-professional_medicine": {
"acc": 0.3014705882352941,
"acc_stderr": 0.027875982114273168,
"acc_norm": 0.26838235294117646,
"acc_norm_stderr": 0.02691748122437721
},
"winogrande": {
"acc": 0.494869771112865,
"acc_stderr": 0.014051745961790516
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.23333333333333334,
"acc_stderr": 0.02144454730156047,
"acc_norm": 0.2717948717948718,
"acc_norm_stderr": 0.02255655101013236
},
"hendrycksTest-human_aging": {
"acc": 0.3004484304932735,
"acc_stderr": 0.030769352008229136,
"acc_norm": 0.242152466367713,
"acc_norm_stderr": 0.028751392398694755
},
"hendrycksTest-college_physics": {
"acc": 0.19607843137254902,
"acc_stderr": 0.03950581861179962,
"acc_norm": 0.21568627450980393,
"acc_norm_stderr": 0.04092563958237654
},
"logiqa": {
"acc": 0.2227342549923195,
"acc_stderr": 0.01632005404616512,
"acc_norm": 0.27956989247311825,
"acc_norm_stderr": 0.017602909186822453
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.19704433497536947,
"acc_stderr": 0.02798672466673622,
"acc_norm": 0.23645320197044334,
"acc_norm_stderr": 0.02989611429173355
},
"hendrycksTest-public_relations": {
"acc": 0.2909090909090909,
"acc_stderr": 0.04350271442923243,
"acc_norm": 0.2,
"acc_norm_stderr": 0.038313051408846034
},
"arc_challenge": {
"acc": 0.1757679180887372,
"acc_stderr": 0.011122850863120485,
"acc_norm": 0.21331058020477817,
"acc_norm_stderr": 0.011970971742326334
},
"crows_pairs_english_age": {
"likelihood_difference": 2.652129120879121,
"likelihood_difference_stderr": 0.2944534289937784,
"pct_stereotype": 0.5164835164835165,
"pct_stereotype_stderr": 0.05267597952306975
}
},
"versions": {
"hendrycksTest-sociology": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-clinical_knowledge": 0,
"piqa": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-college_medicine": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-abstract_algebra": 0,
"crows_pairs_french_socioeconomic": 0,
"crows_pairs_english": 0,
"crows_pairs_french": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_us_history": 0,
"crows_pairs_french_gender": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-high_school_physics": 0,
"wsc": 0,
"hendrycksTest-jurisprudence": 0,
"arc_easy": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-high_school_psychology": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-high_school_european_history": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-anatomy": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-philosophy": 0,
"lambada_openai": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-virology": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-machine_learning": 0,
"crows_pairs_english_race_color": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-management": 0,
"sciq": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-high_school_world_history": 0,
"crows_pairs_french_race_color": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-college_biology": 0,
"crows_pairs_french_age": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-high_school_geography": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-logical_fallacies": 0,
"crows_pairs_french_nationality": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-professional_law": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-professional_psychology": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-professional_medicine": 0,
"winogrande": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-college_physics": 0,
"logiqa": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-public_relations": 0,
"arc_challenge": 0,
"crows_pairs_english_age": 0
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step53000",
"num_fewshot": 0,
"batch_size": 16,
"device": "cuda:5",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}