karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
Raw
History Blame
21.9 kB
{
"results": {
"hendrycksTest-machine_learning": {
"acc": 0.25,
"acc_stderr": 0.04109974682633932,
"acc_norm": 0.21428571428571427,
"acc_norm_stderr": 0.03894641120044793
},
"crows_pairs_french_religion": {
"likelihood_difference": 5.236141304347826,
"likelihood_difference_stderr": 0.5106076448625602,
"pct_stereotype": 0.5391304347826087,
"pct_stereotype_stderr": 0.04668566114758416
},
"hendrycksTest-professional_medicine": {
"acc": 0.30514705882352944,
"acc_stderr": 0.027971541370170595,
"acc_norm": 0.27205882352941174,
"acc_norm_stderr": 0.027033041151681456
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 5.860233516483516,
"likelihood_difference_stderr": 0.5089789548023154,
"pct_stereotype": 0.8131868131868132,
"pct_stereotype_stderr": 0.04108446855035881
},
"hendrycksTest-moral_scenarios": {
"acc": 0.24692737430167597,
"acc_stderr": 0.014422292204808835,
"acc_norm": 0.24692737430167597,
"acc_norm_stderr": 0.014422292204808835
},
"crows_pairs_english_autre": {
"likelihood_difference": 6.355113636363637,
"likelihood_difference_stderr": 1.7489509473745437,
"pct_stereotype": 0.5454545454545454,
"pct_stereotype_stderr": 0.1574591643244434
},
"crows_pairs_english_age": {
"likelihood_difference": 2.771291208791209,
"likelihood_difference_stderr": 0.26169461121705356,
"pct_stereotype": 0.5164835164835165,
"pct_stereotype_stderr": 0.05267597952306975
},
"hendrycksTest-human_sexuality": {
"acc": 0.3435114503816794,
"acc_stderr": 0.041649760719448786,
"acc_norm": 0.2900763358778626,
"acc_norm_stderr": 0.03980066246467766
},
"crows_pairs_english_gender": {
"likelihood_difference": 3.02265625,
"likelihood_difference_stderr": 0.2641863477227852,
"pct_stereotype": 0.540625,
"pct_stereotype_stderr": 0.027902068404300068
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.1814814814814815,
"acc_stderr": 0.02349926466940731,
"acc_norm": 0.23703703703703705,
"acc_norm_stderr": 0.025928876132766104
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.28,
"acc_stderr": 0.04512608598542127,
"acc_norm": 0.27,
"acc_norm_stderr": 0.04461960433384741
},
"hendrycksTest-formal_logic": {
"acc": 0.2777777777777778,
"acc_stderr": 0.04006168083848876,
"acc_norm": 0.23809523809523808,
"acc_norm_stderr": 0.038095238095238126
},
"hendrycksTest-computer_security": {
"acc": 0.22,
"acc_stderr": 0.041633319989322716,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421255
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2896551724137931,
"acc_stderr": 0.03780019230438014,
"acc_norm": 0.31724137931034485,
"acc_norm_stderr": 0.03878352372138622
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.25660377358490566,
"acc_stderr": 0.026880647889051982,
"acc_norm": 0.3320754716981132,
"acc_norm_stderr": 0.028985455652334388
},
"hendrycksTest-human_aging": {
"acc": 0.23766816143497757,
"acc_stderr": 0.028568079464714263,
"acc_norm": 0.21973094170403587,
"acc_norm_stderr": 0.02779017706438361
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 4.497311827956989,
"likelihood_difference_stderr": 0.547355686843944,
"pct_stereotype": 0.8172043010752689,
"pct_stereotype_stderr": 0.040295300106155174
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.21243523316062177,
"acc_stderr": 0.02951928261681725,
"acc_norm": 0.2694300518134715,
"acc_norm_stderr": 0.03201867122877794
},
"hendrycksTest-professional_law": {
"acc": 0.24185136897001303,
"acc_stderr": 0.010936550813827065,
"acc_norm": 0.288135593220339,
"acc_norm_stderr": 0.011567140661324568
},
"hendrycksTest-professional_psychology": {
"acc": 0.24019607843137256,
"acc_stderr": 0.01728276069516741,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.01784808957491322
},
"hendrycksTest-marketing": {
"acc": 0.24786324786324787,
"acc_stderr": 0.028286324075564404,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.02934311479809447
},
"hendrycksTest-management": {
"acc": 0.1941747572815534,
"acc_stderr": 0.03916667762822583,
"acc_norm": 0.2815533980582524,
"acc_norm_stderr": 0.04453254836326469
},
"hendrycksTest-public_relations": {
"acc": 0.3,
"acc_stderr": 0.04389311454644286,
"acc_norm": 0.22727272727272727,
"acc_norm_stderr": 0.040139645540727735
},
"hendrycksTest-nutrition": {
"acc": 0.2875816993464052,
"acc_stderr": 0.02591780611714716,
"acc_norm": 0.35947712418300654,
"acc_norm_stderr": 0.027475969910660952
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 5.818142361111111,
"likelihood_difference_stderr": 0.641566712133372,
"pct_stereotype": 0.5,
"pct_stereotype_stderr": 0.05933908290969268
},
"hendrycksTest-college_medicine": {
"acc": 0.21965317919075145,
"acc_stderr": 0.031568093627031744,
"acc_norm": 0.32947976878612717,
"acc_norm_stderr": 0.03583901754736412
},
"crows_pairs_english_race_color": {
"likelihood_difference": 3.48705093503937,
"likelihood_difference_stderr": 0.17608529748474508,
"pct_stereotype": 0.4625984251968504,
"pct_stereotype_stderr": 0.02214356608896984
},
"hendrycksTest-logical_fallacies": {
"acc": 0.20245398773006135,
"acc_stderr": 0.03157065078911902,
"acc_norm": 0.3128834355828221,
"acc_norm_stderr": 0.03642914578292404
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.23544973544973544,
"acc_stderr": 0.02185150982203172,
"acc_norm": 0.24603174603174602,
"acc_norm_stderr": 0.022182037202948365
},
"hendrycksTest-professional_accounting": {
"acc": 0.24113475177304963,
"acc_stderr": 0.02551873104953776,
"acc_norm": 0.2553191489361702,
"acc_norm_stderr": 0.026011992930902013
},
"hendrycksTest-high_school_european_history": {
"acc": 0.22424242424242424,
"acc_stderr": 0.03256866661681102,
"acc_norm": 0.2909090909090909,
"acc_norm_stderr": 0.03546563019624337
},
"hendrycksTest-college_mathematics": {
"acc": 0.17,
"acc_stderr": 0.03775251680686371,
"acc_norm": 0.23,
"acc_norm_stderr": 0.04229525846816505
},
"hendrycksTest-prehistory": {
"acc": 0.2716049382716049,
"acc_stderr": 0.024748624490537382,
"acc_norm": 0.21604938271604937,
"acc_norm_stderr": 0.022899162918445785
},
"hendrycksTest-global_facts": {
"acc": 0.23,
"acc_stderr": 0.042295258468165065,
"acc_norm": 0.23,
"acc_norm_stderr": 0.042295258468165065
},
"hendrycksTest-anatomy": {
"acc": 0.16296296296296298,
"acc_stderr": 0.0319054147448284,
"acc_norm": 0.17037037037037037,
"acc_norm_stderr": 0.03247781185995593
},
"hendrycksTest-college_computer_science": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"crows_pairs_english_nationality": {
"likelihood_difference": 3.807146990740741,
"likelihood_difference_stderr": 0.2792489767677307,
"pct_stereotype": 0.4351851851851852,
"pct_stereotype_stderr": 0.03381200005643525
},
"hendrycksTest-high_school_biology": {
"acc": 0.25806451612903225,
"acc_stderr": 0.024892469172462826,
"acc_norm": 0.29354838709677417,
"acc_norm_stderr": 0.025906087021319295
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 3.8326822916666665,
"likelihood_difference_stderr": 0.42534095862131277,
"pct_stereotype": 0.5555555555555556,
"pct_stereotype_stderr": 0.05897165471491952
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.24,
"acc_stderr": 0.04292346959909284,
"acc_norm": 0.32,
"acc_norm_stderr": 0.04688261722621503
},
"hendrycksTest-college_physics": {
"acc": 0.21568627450980393,
"acc_stderr": 0.04092563958237655,
"acc_norm": 0.28431372549019607,
"acc_norm_stderr": 0.04488482852329017
},
"winogrande": {
"acc": 0.4972375690607735,
"acc_stderr": 0.014052271211616441
},
"logiqa": {
"acc": 0.20890937019969277,
"acc_stderr": 0.01594539939642392,
"acc_norm": 0.28417818740399386,
"acc_norm_stderr": 0.01769054268019078
},
"lambada_openai": {
"ppl": 118.09596009074914,
"ppl_stderr": 4.94543500156858,
"acc": 0.2233650300795653,
"acc_stderr": 0.005802673494605816
},
"hendrycksTest-high_school_geography": {
"acc": 0.23232323232323232,
"acc_stderr": 0.030088629490217487,
"acc_norm": 0.2828282828282828,
"acc_norm_stderr": 0.032087795587867514
},
"hendrycksTest-econometrics": {
"acc": 0.23684210526315788,
"acc_stderr": 0.039994238792813365,
"acc_norm": 0.22807017543859648,
"acc_norm_stderr": 0.03947152782669415
},
"crows_pairs_french_race_color": {
"likelihood_difference": 4.586209239130435,
"likelihood_difference_stderr": 0.22439998730100816,
"pct_stereotype": 0.2847826086956522,
"pct_stereotype_stderr": 0.02106538604116979
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 3.9653782894736844,
"likelihood_difference_stderr": 0.28742290506987894,
"pct_stereotype": 0.6263157894736842,
"pct_stereotype_stderr": 0.035189909668609055
},
"crows_pairs_french_nationality": {
"likelihood_difference": 7.6789772727272725,
"likelihood_difference_stderr": 0.41601257841823347,
"pct_stereotype": 0.2648221343873518,
"pct_stereotype_stderr": 0.02779540983044468
},
"hendrycksTest-high_school_psychology": {
"acc": 0.26788990825688075,
"acc_stderr": 0.018987462257978652,
"acc_norm": 0.25871559633027524,
"acc_norm_stderr": 0.018776052319619617
},
"crows_pairs_french": {
"likelihood_difference": 5.2784641472868215,
"likelihood_difference_stderr": 0.12512951793875754,
"pct_stereotype": 0.407871198568873,
"pct_stereotype_stderr": 0.012004182941077525
},
"crows_pairs_french_disability": {
"likelihood_difference": 6.4512310606060606,
"likelihood_difference_stderr": 0.6395317220387889,
"pct_stereotype": 0.45454545454545453,
"pct_stereotype_stderr": 0.06176056549879611
},
"hendrycksTest-high_school_physics": {
"acc": 0.23178807947019867,
"acc_stderr": 0.03445406271987053,
"acc_norm": 0.2119205298013245,
"acc_norm_stderr": 0.03336767086567978
},
"arc_easy": {
"acc": 0.4010942760942761,
"acc_stderr": 0.010057051106534374,
"acc_norm": 0.36447811447811446,
"acc_norm_stderr": 0.009875729282482438
},
"hendrycksTest-conceptual_physics": {
"acc": 0.25957446808510637,
"acc_stderr": 0.02865917937429232,
"acc_norm": 0.19574468085106383,
"acc_norm_stderr": 0.025937853139977145
},
"crows_pairs_french_age": {
"likelihood_difference": 4.49375,
"likelihood_difference_stderr": 0.48104738994215757,
"pct_stereotype": 0.4,
"pct_stereotype_stderr": 0.05192907868894985
},
"crows_pairs_english": {
"likelihood_difference": 3.625680158020274,
"likelihood_difference_stderr": 0.10323728907768165,
"pct_stereotype": 0.5372689326177699,
"pct_stereotype_stderr": 0.012179324068364769
},
"hendrycksTest-sociology": {
"acc": 0.25870646766169153,
"acc_stderr": 0.03096590312357304,
"acc_norm": 0.26865671641791045,
"acc_norm_stderr": 0.03134328358208954
},
"hendrycksTest-miscellaneous": {
"acc": 0.27330779054916987,
"acc_stderr": 0.01593668106262856,
"acc_norm": 0.2503192848020434,
"acc_norm_stderr": 0.0154910889514946
},
"arc_challenge": {
"acc": 0.181740614334471,
"acc_stderr": 0.011269198948880236,
"acc_norm": 0.21416382252559726,
"acc_norm_stderr": 0.011988383205966497
},
"hendrycksTest-world_religions": {
"acc": 0.29239766081871343,
"acc_stderr": 0.03488647713457922,
"acc_norm": 0.3333333333333333,
"acc_norm_stderr": 0.03615507630310935
},
"hendrycksTest-astronomy": {
"acc": 0.21052631578947367,
"acc_stderr": 0.033176727875331574,
"acc_norm": 0.3618421052631579,
"acc_norm_stderr": 0.03910525752849724
},
"hendrycksTest-business_ethics": {
"acc": 0.36,
"acc_stderr": 0.048241815132442176,
"acc_norm": 0.33,
"acc_norm_stderr": 0.04725815626252604
},
"hendrycksTest-college_biology": {
"acc": 0.2847222222222222,
"acc_stderr": 0.037738099906869355,
"acc_norm": 0.2847222222222222,
"acc_norm_stderr": 0.03773809990686934
},
"piqa": {
"acc": 0.5984766050054406,
"acc_stderr": 0.011437324373397848,
"acc_norm": 0.5930359085963003,
"acc_norm_stderr": 0.011462093919190166
},
"crows_pairs_french_gender": {
"likelihood_difference": 4.313473520249222,
"likelihood_difference_stderr": 0.19628650459456284,
"pct_stereotype": 0.5202492211838006,
"pct_stereotype_stderr": 0.027927918885132307
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.23109243697478993,
"acc_stderr": 0.02738140692786898,
"acc_norm": 0.29831932773109243,
"acc_norm_stderr": 0.02971914287634287
},
"hendrycksTest-virology": {
"acc": 0.21686746987951808,
"acc_stderr": 0.03208284450356365,
"acc_norm": 0.2469879518072289,
"acc_norm_stderr": 0.03357351982064536
},
"hendrycksTest-jurisprudence": {
"acc": 0.26851851851851855,
"acc_stderr": 0.04284467968052191,
"acc_norm": 0.4074074074074074,
"acc_norm_stderr": 0.04750077341199985
},
"hendrycksTest-abstract_algebra": {
"acc": 0.24,
"acc_stderr": 0.04292346959909284,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768078
},
"crows_pairs_english_disability": {
"likelihood_difference": 5.187980769230769,
"likelihood_difference_stderr": 0.5880197346199485,
"pct_stereotype": 0.6153846153846154,
"pct_stereotype_stderr": 0.06081303192631497
},
"hendrycksTest-high_school_us_history": {
"acc": 0.24019607843137256,
"acc_stderr": 0.02998373305591361,
"acc_norm": 0.2696078431372549,
"acc_norm_stderr": 0.031145570659486782
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.21674876847290642,
"acc_stderr": 0.02899033125251624,
"acc_norm": 0.26108374384236455,
"acc_norm_stderr": 0.030903796952114468
},
"sciq": {
"acc": 0.664,
"acc_stderr": 0.014944140233795027,
"acc_norm": 0.576,
"acc_norm_stderr": 0.01563548747140519
},
"hendrycksTest-philosophy": {
"acc": 0.2057877813504823,
"acc_stderr": 0.022961339906764244,
"acc_norm": 0.28938906752411575,
"acc_norm_stderr": 0.025755865922632945
},
"hendrycksTest-security_studies": {
"acc": 0.30612244897959184,
"acc_stderr": 0.02950489645459597,
"acc_norm": 0.2530612244897959,
"acc_norm_stderr": 0.027833023871399683
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.258974358974359,
"acc_stderr": 0.022211106810061675,
"acc_norm": 0.28205128205128205,
"acc_norm_stderr": 0.022815813098896597
},
"hendrycksTest-moral_disputes": {
"acc": 0.27167630057803466,
"acc_stderr": 0.023948512905468365,
"acc_norm": 0.32947976878612717,
"acc_norm_stderr": 0.025305258131879716
},
"hendrycksTest-international_law": {
"acc": 0.18181818181818182,
"acc_stderr": 0.03520893951097652,
"acc_norm": 0.4214876033057851,
"acc_norm_stderr": 0.04507732278775094
},
"wsc": {
"acc": 0.36538461538461536,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-medical_genetics": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"crows_pairs_english_religion": {
"likelihood_difference": 3.714527027027027,
"likelihood_difference_stderr": 0.39412229193840076,
"pct_stereotype": 0.6396396396396397,
"pct_stereotype_stderr": 0.04577621167070314
},
"crows_pairs_french_autre": {
"likelihood_difference": 2.5288461538461537,
"likelihood_difference_stderr": 0.9157702142826863,
"pct_stereotype": 0.38461538461538464,
"pct_stereotype_stderr": 0.1404416814115811
},
"hendrycksTest-high_school_world_history": {
"acc": 0.270042194092827,
"acc_stderr": 0.028900721906293426,
"acc_norm": 0.270042194092827,
"acc_norm_stderr": 0.028900721906293426
},
"hendrycksTest-high_school_statistics": {
"acc": 0.18518518518518517,
"acc_stderr": 0.02649191472735516,
"acc_norm": 0.24537037037037038,
"acc_norm_stderr": 0.029346665094372924
},
"hendrycksTest-college_chemistry": {
"acc": 0.29,
"acc_stderr": 0.04560480215720684,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 5.089205994897959,
"likelihood_difference_stderr": 0.3514259595841283,
"pct_stereotype": 0.3877551020408163,
"pct_stereotype_stderr": 0.03489185364347385
}
},
"versions": {
"hendrycksTest-machine_learning": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-professional_medicine": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-moral_scenarios": 0,
"crows_pairs_english_autre": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-human_sexuality": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-human_aging": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-management": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-nutrition": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-college_medicine": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-college_computer_science": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-high_school_biology": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-college_physics": 0,
"winogrande": 0,
"logiqa": 0,
"lambada_openai": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-econometrics": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_english_socioeconomic": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-high_school_psychology": 0,
"crows_pairs_french": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-high_school_physics": 0,
"arc_easy": 0,
"hendrycksTest-conceptual_physics": 0,
"crows_pairs_french_age": 0,
"crows_pairs_english": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-miscellaneous": 0,
"arc_challenge": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-college_biology": 0,
"piqa": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-abstract_algebra": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_chemistry": 0,
"sciq": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-international_law": 0,
"wsc": 0,
"hendrycksTest-medical_genetics": 0,
"crows_pairs_english_religion": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-college_chemistry": 0,
"crows_pairs_french_socioeconomic": 0
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step33000",
"num_fewshot": 0,
"batch_size": 16,
"device": "cuda:3",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}