karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
21.9 kB
{
"results": {
"hendrycksTest-miscellaneous": {
"acc": 0.26181353767560667,
"acc_stderr": 0.01572083867844526,
"acc_norm": 0.24776500638569604,
"acc_norm_stderr": 0.015438083080568965
},
"hendrycksTest-professional_accounting": {
"acc": 0.2978723404255319,
"acc_stderr": 0.027281608344469414,
"acc_norm": 0.2695035460992908,
"acc_norm_stderr": 0.02646903681859062
},
"hendrycksTest-moral_scenarios": {
"acc": 0.2346368715083799,
"acc_stderr": 0.014173044098303654,
"acc_norm": 0.2569832402234637,
"acc_norm_stderr": 0.014614465821966361
},
"sciq": {
"acc": 0.633,
"acc_stderr": 0.015249378464171749,
"acc_norm": 0.552,
"acc_norm_stderr": 0.01573351656634783
},
"hendrycksTest-nutrition": {
"acc": 0.27450980392156865,
"acc_stderr": 0.02555316999182651,
"acc_norm": 0.3366013071895425,
"acc_norm_stderr": 0.027057974624494382
},
"piqa": {
"acc": 0.5973884657236126,
"acc_stderr": 0.011442395233488698,
"acc_norm": 0.5854189336235038,
"acc_norm_stderr": 0.011494326682255158
},
"hendrycksTest-high_school_us_history": {
"acc": 0.2549019607843137,
"acc_stderr": 0.030587591351604243,
"acc_norm": 0.27941176470588236,
"acc_norm_stderr": 0.031493281045079556
},
"hendrycksTest-international_law": {
"acc": 0.15702479338842976,
"acc_stderr": 0.0332124484254713,
"acc_norm": 0.4132231404958678,
"acc_norm_stderr": 0.04495087843548408
},
"hendrycksTest-anatomy": {
"acc": 0.2222222222222222,
"acc_stderr": 0.035914440841969694,
"acc_norm": 0.2740740740740741,
"acc_norm_stderr": 0.03853254836552003
},
"crows_pairs_french_gender": {
"likelihood_difference": 4.660533489096573,
"likelihood_difference_stderr": 0.22532366484380598,
"pct_stereotype": 0.5077881619937694,
"pct_stereotype_stderr": 0.027947458769356347
},
"hendrycksTest-professional_medicine": {
"acc": 0.2757352941176471,
"acc_stderr": 0.027146271936625162,
"acc_norm": 0.3125,
"acc_norm_stderr": 0.02815637344037142
},
"hendrycksTest-high_school_psychology": {
"acc": 0.27339449541284405,
"acc_stderr": 0.0191092998460983,
"acc_norm": 0.24403669724770644,
"acc_norm_stderr": 0.018415286351416395
},
"hendrycksTest-astronomy": {
"acc": 0.23684210526315788,
"acc_stderr": 0.03459777606810535,
"acc_norm": 0.3355263157894737,
"acc_norm_stderr": 0.03842498559395268
},
"hendrycksTest-logical_fallacies": {
"acc": 0.22699386503067484,
"acc_stderr": 0.03291099578615769,
"acc_norm": 0.26380368098159507,
"acc_norm_stderr": 0.03462419931615623
},
"crows_pairs_french_disability": {
"likelihood_difference": 6.6946022727272725,
"likelihood_difference_stderr": 0.7491237826255029,
"pct_stereotype": 0.5,
"pct_stereotype_stderr": 0.06201736729460421
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2315270935960591,
"acc_stderr": 0.029678333141444455,
"acc_norm": 0.3054187192118227,
"acc_norm_stderr": 0.03240661565868408
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.022569897074918424,
"acc_norm": 0.25925925925925924,
"acc_norm_stderr": 0.022569897074918424
},
"hendrycksTest-human_sexuality": {
"acc": 0.3053435114503817,
"acc_stderr": 0.040393149787245605,
"acc_norm": 0.22137404580152673,
"acc_norm_stderr": 0.03641297081313729
},
"hendrycksTest-professional_psychology": {
"acc": 0.25,
"acc_stderr": 0.01751781884501444,
"acc_norm": 0.2630718954248366,
"acc_norm_stderr": 0.017812676542320653
},
"crows_pairs_french_religion": {
"likelihood_difference": 4.175815217391304,
"likelihood_difference_stderr": 0.5425080644657401,
"pct_stereotype": 0.5130434782608696,
"pct_stereotype_stderr": 0.04681335351503156
},
"hendrycksTest-college_computer_science": {
"acc": 0.28,
"acc_stderr": 0.045126085985421255,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"crows_pairs_english_autre": {
"likelihood_difference": 5.3977272727272725,
"likelihood_difference_stderr": 1.790491828842816,
"pct_stereotype": 0.6363636363636364,
"pct_stereotype_stderr": 0.15212000482437738
},
"hendrycksTest-econometrics": {
"acc": 0.30701754385964913,
"acc_stderr": 0.04339138322579861,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.041424397194893624
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.23529411764705882,
"acc_stderr": 0.027553614467863825,
"acc_norm": 0.3403361344537815,
"acc_norm_stderr": 0.030778057422931673
},
"hendrycksTest-moral_disputes": {
"acc": 0.2774566473988439,
"acc_stderr": 0.024105712607754307,
"acc_norm": 0.2947976878612717,
"acc_norm_stderr": 0.024547617794803835
},
"hendrycksTest-machine_learning": {
"acc": 0.33035714285714285,
"acc_stderr": 0.04464285714285713,
"acc_norm": 0.20535714285714285,
"acc_norm_stderr": 0.03834241021419073
},
"hendrycksTest-management": {
"acc": 0.2621359223300971,
"acc_stderr": 0.043546310772605935,
"acc_norm": 0.27184466019417475,
"acc_norm_stderr": 0.044052680241409216
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 5.223692602040816,
"likelihood_difference_stderr": 0.3716517632652829,
"pct_stereotype": 0.5510204081632653,
"pct_stereotype_stderr": 0.03561884533975955
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2564102564102564,
"acc_stderr": 0.022139081103971534,
"acc_norm": 0.28205128205128205,
"acc_norm_stderr": 0.022815813098896597
},
"hendrycksTest-security_studies": {
"acc": 0.2897959183673469,
"acc_stderr": 0.029043088683304345,
"acc_norm": 0.2530612244897959,
"acc_norm_stderr": 0.027833023871399683
},
"hendrycksTest-medical_genetics": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-high_school_statistics": {
"acc": 0.2361111111111111,
"acc_stderr": 0.028963702570791033,
"acc_norm": 0.27314814814814814,
"acc_norm_stderr": 0.03038805130167812
},
"crows_pairs_english": {
"likelihood_difference": 3.675657796660704,
"likelihood_difference_stderr": 0.10428478695252169,
"pct_stereotype": 0.5438282647584973,
"pct_stereotype_stderr": 0.012166287275376289
},
"hendrycksTest-high_school_physics": {
"acc": 0.2185430463576159,
"acc_stderr": 0.03374235550425694,
"acc_norm": 0.25165562913907286,
"acc_norm_stderr": 0.03543304234389985
},
"crows_pairs_english_religion": {
"likelihood_difference": 3.5057713963963963,
"likelihood_difference_stderr": 0.4253117969664197,
"pct_stereotype": 0.6216216216216216,
"pct_stereotype_stderr": 0.04624128233851482
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 4.478158602150538,
"likelihood_difference_stderr": 0.5463367427565824,
"pct_stereotype": 0.7849462365591398,
"pct_stereotype_stderr": 0.04283507835554755
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 4.158223684210526,
"likelihood_difference_stderr": 0.2827099752616182,
"pct_stereotype": 0.5842105263157895,
"pct_stereotype_stderr": 0.0358501132552001
},
"crows_pairs_english_gender": {
"likelihood_difference": 2.90234375,
"likelihood_difference_stderr": 0.26743360486517015,
"pct_stereotype": 0.5375,
"pct_stereotype_stderr": 0.02791577963000663
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2413793103448276,
"acc_stderr": 0.03565998174135303,
"acc_norm": 0.27586206896551724,
"acc_norm_stderr": 0.03724563619774632
},
"hendrycksTest-business_ethics": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"hendrycksTest-global_facts": {
"acc": 0.19,
"acc_stderr": 0.03942772444036625,
"acc_norm": 0.24,
"acc_norm_stderr": 0.042923469599092816
},
"hendrycksTest-public_relations": {
"acc": 0.2545454545454545,
"acc_stderr": 0.04172343038705383,
"acc_norm": 0.20909090909090908,
"acc_norm_stderr": 0.03895091015724137
},
"crows_pairs_french_age": {
"likelihood_difference": 4.967708333333333,
"likelihood_difference_stderr": 0.4550873657608913,
"pct_stereotype": 0.43333333333333335,
"pct_stereotype_stderr": 0.05252667118728807
},
"hendrycksTest-virology": {
"acc": 0.25301204819277107,
"acc_stderr": 0.033844291552331346,
"acc_norm": 0.21686746987951808,
"acc_norm_stderr": 0.03208284450356365
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 5.224392361111111,
"likelihood_difference_stderr": 0.5949955425776441,
"pct_stereotype": 0.4861111111111111,
"pct_stereotype_stderr": 0.059316185327165566
},
"hendrycksTest-human_aging": {
"acc": 0.26905829596412556,
"acc_stderr": 0.02976377940687497,
"acc_norm": 0.21524663677130046,
"acc_norm_stderr": 0.027584066602208274
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.25906735751295334,
"acc_stderr": 0.03161877917935409,
"acc_norm": 0.3005181347150259,
"acc_norm_stderr": 0.033088185944157515
},
"hendrycksTest-philosophy": {
"acc": 0.24437299035369775,
"acc_stderr": 0.024406162094668893,
"acc_norm": 0.26688102893890675,
"acc_norm_stderr": 0.025122637608816646
},
"lambada_openai": {
"ppl": 124.26962204175287,
"ppl_stderr": 5.363117769801199,
"acc": 0.22627595575392975,
"acc_stderr": 0.005829406265404375
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 3.8758680555555554,
"likelihood_difference_stderr": 0.41377726625457284,
"pct_stereotype": 0.625,
"pct_stereotype_stderr": 0.05745481997211521
},
"winogrande": {
"acc": 0.5193370165745856,
"acc_stderr": 0.014041972733712972
},
"crows_pairs_english_nationality": {
"likelihood_difference": 3.6435908564814814,
"likelihood_difference_stderr": 0.26705840381438256,
"pct_stereotype": 0.4305555555555556,
"pct_stereotype_stderr": 0.03376922151252336
},
"hendrycksTest-college_physics": {
"acc": 0.17647058823529413,
"acc_stderr": 0.03793281185307809,
"acc_norm": 0.23529411764705882,
"acc_norm_stderr": 0.04220773659171453
},
"crows_pairs_english_race_color": {
"likelihood_difference": 3.7424950787401574,
"likelihood_difference_stderr": 0.18169346622004526,
"pct_stereotype": 0.5059055118110236,
"pct_stereotype_stderr": 0.02220423067397246
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2936170212765957,
"acc_stderr": 0.02977164271249123,
"acc_norm": 0.1829787234042553,
"acc_norm_stderr": 0.025276041000449966
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.23773584905660378,
"acc_stderr": 0.026199808807561915,
"acc_norm": 0.3018867924528302,
"acc_norm_stderr": 0.028254200344438662
},
"hendrycksTest-college_mathematics": {
"acc": 0.18,
"acc_stderr": 0.03861229196653697,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036845
},
"hendrycksTest-abstract_algebra": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-computer_security": {
"acc": 0.22,
"acc_stderr": 0.041633319989322716,
"acc_norm": 0.32,
"acc_norm_stderr": 0.04688261722621503
},
"hendrycksTest-world_religions": {
"acc": 0.23976608187134502,
"acc_stderr": 0.03274485211946956,
"acc_norm": 0.3157894736842105,
"acc_norm_stderr": 0.03565079670708311
},
"hendrycksTest-sociology": {
"acc": 0.24378109452736318,
"acc_stderr": 0.030360490154014638,
"acc_norm": 0.2835820895522388,
"acc_norm_stderr": 0.03187187537919798
},
"hendrycksTest-college_chemistry": {
"acc": 0.32,
"acc_stderr": 0.04688261722621504,
"acc_norm": 0.33,
"acc_norm_stderr": 0.047258156262526045
},
"hendrycksTest-high_school_world_history": {
"acc": 0.2109704641350211,
"acc_stderr": 0.02655837250266192,
"acc_norm": 0.2742616033755274,
"acc_norm_stderr": 0.029041333510598046
},
"logiqa": {
"acc": 0.23195084485407066,
"acc_stderr": 0.0165552524979259,
"acc_norm": 0.27035330261136714,
"acc_norm_stderr": 0.01742069478339314
},
"wsc": {
"acc": 0.36538461538461536,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.2,
"acc_stderr": 0.04020151261036843,
"acc_norm": 0.26,
"acc_norm_stderr": 0.044084400227680814
},
"hendrycksTest-high_school_biology": {
"acc": 0.2870967741935484,
"acc_stderr": 0.025736542745594528,
"acc_norm": 0.3,
"acc_norm_stderr": 0.02606936229533513
},
"hendrycksTest-marketing": {
"acc": 0.27350427350427353,
"acc_stderr": 0.029202540153431177,
"acc_norm": 0.2606837606837607,
"acc_norm_stderr": 0.028760348956523414
},
"hendrycksTest-professional_law": {
"acc": 0.24771838331160365,
"acc_stderr": 0.011025499291443742,
"acc_norm": 0.27444589308996087,
"acc_norm_stderr": 0.011397043163078154
},
"crows_pairs_french_nationality": {
"likelihood_difference": 7.527667984189724,
"likelihood_difference_stderr": 0.4209795564667756,
"pct_stereotype": 0.308300395256917,
"pct_stereotype_stderr": 0.02909012143059231
},
"hendrycksTest-prehistory": {
"acc": 0.26851851851851855,
"acc_stderr": 0.024659685185967284,
"acc_norm": 0.21296296296296297,
"acc_norm_stderr": 0.0227797190887334
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 7.548076923076923,
"likelihood_difference_stderr": 0.5113727094452629,
"pct_stereotype": 0.8131868131868132,
"pct_stereotype_stderr": 0.04108446855035883
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.1814814814814815,
"acc_stderr": 0.023499264669407292,
"acc_norm": 0.22962962962962963,
"acc_norm_stderr": 0.025644108639267613
},
"hendrycksTest-jurisprudence": {
"acc": 0.2037037037037037,
"acc_stderr": 0.038935425188248475,
"acc_norm": 0.3611111111111111,
"acc_norm_stderr": 0.04643454608906275
},
"crows_pairs_french_race_color": {
"likelihood_difference": 4.847758152173913,
"likelihood_difference_stderr": 0.2507391728199927,
"pct_stereotype": 0.3239130434782609,
"pct_stereotype_stderr": 0.021842842500532617
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.24,
"acc_stderr": 0.04292346959909283,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909283
},
"arc_easy": {
"acc": 0.37415824915824913,
"acc_stderr": 0.009929516948977625,
"acc_norm": 0.3367003367003367,
"acc_norm_stderr": 0.009697166595752477
},
"arc_challenge": {
"acc": 0.18600682593856654,
"acc_stderr": 0.011370940183266749,
"acc_norm": 0.22440273037542663,
"acc_norm_stderr": 0.012191404938603843
},
"crows_pairs_english_age": {
"likelihood_difference": 2.8133585164835164,
"likelihood_difference_stderr": 0.27309263450343635,
"pct_stereotype": 0.4725274725274725,
"pct_stereotype_stderr": 0.05262501097748859
},
"crows_pairs_english_disability": {
"likelihood_difference": 5.492307692307692,
"likelihood_difference_stderr": 0.571991498636384,
"pct_stereotype": 0.6461538461538462,
"pct_stereotype_stderr": 0.05977027026123099
},
"crows_pairs_french": {
"likelihood_difference": 5.452854800238521,
"likelihood_difference_stderr": 0.13262546821335017,
"pct_stereotype": 0.4442456768038163,
"pct_stereotype_stderr": 0.012137130534698507
},
"hendrycksTest-formal_logic": {
"acc": 0.2777777777777778,
"acc_stderr": 0.040061680838488774,
"acc_norm": 0.29365079365079366,
"acc_norm_stderr": 0.04073524322147125
},
"hendrycksTest-high_school_european_history": {
"acc": 0.18181818181818182,
"acc_stderr": 0.030117688929503585,
"acc_norm": 0.2606060606060606,
"acc_norm_stderr": 0.03427743175816524
},
"hendrycksTest-high_school_geography": {
"acc": 0.2878787878787879,
"acc_stderr": 0.03225883512300992,
"acc_norm": 0.3181818181818182,
"acc_norm_stderr": 0.03318477333845331
},
"hendrycksTest-college_medicine": {
"acc": 0.24277456647398843,
"acc_stderr": 0.0326926380614177,
"acc_norm": 0.3063583815028902,
"acc_norm_stderr": 0.03514942551267437
},
"crows_pairs_french_autre": {
"likelihood_difference": 4.454326923076923,
"likelihood_difference_stderr": 1.3817380041698064,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.14390989949130545
},
"hendrycksTest-college_biology": {
"acc": 0.2569444444444444,
"acc_stderr": 0.03653946969442099,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.037455547914624576
}
},
"versions": {
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-moral_scenarios": 0,
"sciq": 0,
"hendrycksTest-nutrition": 0,
"piqa": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-anatomy": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-logical_fallacies": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-professional_psychology": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-college_computer_science": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-management": 0,
"crows_pairs_french_socioeconomic": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-high_school_statistics": 0,
"crows_pairs_english": 0,
"hendrycksTest-high_school_physics": 0,
"crows_pairs_english_religion": 0,
"crows_pairs_english_sexual_orientation": 0,
"crows_pairs_english_socioeconomic": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-public_relations": 0,
"crows_pairs_french_age": 0,
"hendrycksTest-virology": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-philosophy": 0,
"lambada_openai": 0,
"crows_pairs_english_physical_appearance": 0,
"winogrande": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-college_physics": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-high_school_world_history": 0,
"logiqa": 0,
"wsc": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-professional_law": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-prehistory": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-jurisprudence": 0,
"crows_pairs_french_race_color": 0,
"hendrycksTest-us_foreign_policy": 0,
"arc_easy": 0,
"arc_challenge": 0,
"crows_pairs_english_age": 0,
"crows_pairs_english_disability": 0,
"crows_pairs_french": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-college_medicine": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-college_biology": 0
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step83000",
"num_fewshot": 0,
"batch_size": 16,
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}