karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
21.9 kB
{
"results": {
"lambada_openai": {
"ppl": 3288862.4386760374,
"ppl_stderr": 311605.46093383565,
"acc": 0.0,
"acc_stderr": 0.0
},
"hendrycksTest-astronomy": {
"acc": 0.16447368421052633,
"acc_stderr": 0.030167533468632723,
"acc_norm": 0.2236842105263158,
"acc_norm_stderr": 0.033911609343436046
},
"winogrande": {
"acc": 0.4940805051302289,
"acc_stderr": 0.01405150083848581
},
"hendrycksTest-high_school_us_history": {
"acc": 0.18627450980392157,
"acc_stderr": 0.027325470966716323,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.030964517926923393
},
"crows_pairs_french_age": {
"likelihood_difference": 5.902777777777778,
"likelihood_difference_stderr": 0.745349522367746,
"pct_stereotype": 0.6777777777777778,
"pct_stereotype_stderr": 0.049536623805744535
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-global_facts": {
"acc": 0.33,
"acc_stderr": 0.047258156262526045,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-high_school_physics": {
"acc": 0.2052980132450331,
"acc_stderr": 0.03297986648473836,
"acc_norm": 0.2980132450331126,
"acc_norm_stderr": 0.03734535676787198
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 12.683394451530612,
"likelihood_difference_stderr": 0.8321591288729919,
"pct_stereotype": 0.45918367346938777,
"pct_stereotype_stderr": 0.03568624151230552
},
"hendrycksTest-international_law": {
"acc": 0.09917355371900827,
"acc_stderr": 0.027285246312758957,
"acc_norm": 0.2396694214876033,
"acc_norm_stderr": 0.03896878985070417
},
"hendrycksTest-medical_genetics": {
"acc": 0.26,
"acc_stderr": 0.0440844002276808,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036845
},
"hendrycksTest-logical_fallacies": {
"acc": 0.17177914110429449,
"acc_stderr": 0.029634717272371013,
"acc_norm": 0.25766871165644173,
"acc_norm_stderr": 0.03436150827846917
},
"hendrycksTest-moral_disputes": {
"acc": 0.22254335260115607,
"acc_stderr": 0.02239421566194282,
"acc_norm": 0.21965317919075145,
"acc_norm_stderr": 0.022289638852617893
},
"crows_pairs_english_disability": {
"likelihood_difference": 7.655769230769231,
"likelihood_difference_stderr": 1.2456701776455885,
"pct_stereotype": 0.6307692307692307,
"pct_stereotype_stderr": 0.060324565928300454
},
"hendrycksTest-prehistory": {
"acc": 0.25925925925925924,
"acc_stderr": 0.02438366553103545,
"acc_norm": 0.24382716049382716,
"acc_norm_stderr": 0.023891879541959603
},
"hendrycksTest-college_mathematics": {
"acc": 0.14,
"acc_stderr": 0.0348735088019777,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036623
},
"crows_pairs_french": {
"likelihood_difference": 10.100835755813954,
"likelihood_difference_stderr": 0.23128974328889199,
"pct_stereotype": 0.5819916517590936,
"pct_stereotype_stderr": 0.012047969184920519
},
"wsc": {
"acc": 0.6346153846153846,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2413793103448276,
"acc_stderr": 0.03565998174135303,
"acc_norm": 0.20689655172413793,
"acc_norm_stderr": 0.03375672449560554
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 6.219184027777778,
"likelihood_difference_stderr": 0.8156476562247187,
"pct_stereotype": 0.5138888888888888,
"pct_stereotype_stderr": 0.05931618532716555
},
"hendrycksTest-management": {
"acc": 0.1941747572815534,
"acc_stderr": 0.03916667762822582,
"acc_norm": 0.23300970873786409,
"acc_norm_stderr": 0.04185832598928315
},
"hendrycksTest-machine_learning": {
"acc": 0.25,
"acc_stderr": 0.04109974682633932,
"acc_norm": 0.3125,
"acc_norm_stderr": 0.043994650575715215
},
"crows_pairs_english_race_color": {
"likelihood_difference": 5.5294045275590555,
"likelihood_difference_stderr": 0.34271615785671483,
"pct_stereotype": 0.36811023622047245,
"pct_stereotype_stderr": 0.021419317453594672
},
"hendrycksTest-marketing": {
"acc": 0.2222222222222222,
"acc_stderr": 0.027236013946196666,
"acc_norm": 0.23931623931623933,
"acc_norm_stderr": 0.02795182680892433
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.20689655172413793,
"acc_stderr": 0.028501378167893946,
"acc_norm": 0.22167487684729065,
"acc_norm_stderr": 0.029225575892489617
},
"hendrycksTest-econometrics": {
"acc": 0.24561403508771928,
"acc_stderr": 0.0404933929774814,
"acc_norm": 0.2807017543859649,
"acc_norm_stderr": 0.04227054451232199
},
"hendrycksTest-virology": {
"acc": 0.14457831325301204,
"acc_stderr": 0.027377874786362316,
"acc_norm": 0.18674698795180722,
"acc_norm_stderr": 0.030338749144500615
},
"hendrycksTest-high_school_psychology": {
"acc": 0.22752293577981653,
"acc_stderr": 0.017974463578776502,
"acc_norm": 0.24954128440366974,
"acc_norm_stderr": 0.01855389762950162
},
"hendrycksTest-high_school_geography": {
"acc": 0.25252525252525254,
"acc_stderr": 0.030954055470365897,
"acc_norm": 0.2474747474747475,
"acc_norm_stderr": 0.03074630074212451
},
"sciq": {
"acc": 0.223,
"acc_stderr": 0.013169830843425661,
"acc_norm": 0.21,
"acc_norm_stderr": 0.012886662332274547
},
"crows_pairs_french_religion": {
"likelihood_difference": 12.11983695652174,
"likelihood_difference_stderr": 0.9761138647537818,
"pct_stereotype": 0.6608695652173913,
"pct_stereotype_stderr": 0.04433930011819816
},
"crows_pairs_english_gender": {
"likelihood_difference": 4.749609375,
"likelihood_difference_stderr": 0.4877724715110692,
"pct_stereotype": 0.48125,
"pct_stereotype_stderr": 0.027974934901776306
},
"hendrycksTest-professional_accounting": {
"acc": 0.26595744680851063,
"acc_stderr": 0.026358065698880582,
"acc_norm": 0.25886524822695034,
"acc_norm_stderr": 0.026129572527180848
},
"logiqa": {
"acc": 0.2196620583717358,
"acc_stderr": 0.01623910941493393,
"acc_norm": 0.23809523809523808,
"acc_norm_stderr": 0.016705867034419633
},
"hendrycksTest-professional_medicine": {
"acc": 0.22794117647058823,
"acc_stderr": 0.025483081468029804,
"acc_norm": 0.2867647058823529,
"acc_norm_stderr": 0.027472274473233818
},
"hendrycksTest-world_religions": {
"acc": 0.1695906432748538,
"acc_stderr": 0.028782108105401712,
"acc_norm": 0.22807017543859648,
"acc_norm_stderr": 0.03218093795602357
},
"hendrycksTest-sociology": {
"acc": 0.2835820895522388,
"acc_stderr": 0.03187187537919796,
"acc_norm": 0.2935323383084577,
"acc_norm_stderr": 0.032200241045342054
},
"hendrycksTest-professional_psychology": {
"acc": 0.22058823529411764,
"acc_stderr": 0.01677467236546854,
"acc_norm": 0.24019607843137256,
"acc_norm_stderr": 0.017282760695167435
},
"hendrycksTest-computer_security": {
"acc": 0.26,
"acc_stderr": 0.04408440022768078,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-philosophy": {
"acc": 0.2379421221864952,
"acc_stderr": 0.024185150647818707,
"acc_norm": 0.2861736334405145,
"acc_norm_stderr": 0.025670259242188943
},
"crows_pairs_french_race_color": {
"likelihood_difference": 9.869972826086956,
"likelihood_difference_stderr": 0.3709338879215957,
"pct_stereotype": 0.7130434782608696,
"pct_stereotype_stderr": 0.021113474740601688
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.18490566037735848,
"acc_stderr": 0.023893351834464324,
"acc_norm": 0.28679245283018867,
"acc_norm_stderr": 0.027834912527544067
},
"crows_pairs_english": {
"likelihood_difference": 5.480079755515802,
"likelihood_difference_stderr": 0.19151850776212573,
"pct_stereotype": 0.45855694692904,
"pct_stereotype_stderr": 0.012171273580365826
},
"crows_pairs_french_nationality": {
"likelihood_difference": 9.49802371541502,
"likelihood_difference_stderr": 0.5281355544781192,
"pct_stereotype": 0.4980237154150198,
"pct_stereotype_stderr": 0.031496793380453074
},
"hendrycksTest-nutrition": {
"acc": 0.20915032679738563,
"acc_stderr": 0.023287685312334803,
"acc_norm": 0.24836601307189543,
"acc_norm_stderr": 0.02473998135511359
},
"hendrycksTest-college_medicine": {
"acc": 0.19653179190751446,
"acc_stderr": 0.030299574664788147,
"acc_norm": 0.24855491329479767,
"acc_norm_stderr": 0.03295304696818318
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 5.714967105263158,
"likelihood_difference_stderr": 0.5307740830599903,
"pct_stereotype": 0.5684210526315789,
"pct_stereotype_stderr": 0.03602751443822843
},
"crows_pairs_english_autre": {
"likelihood_difference": 5.2414772727272725,
"likelihood_difference_stderr": 2.881736459713796,
"pct_stereotype": 0.7272727272727273,
"pct_stereotype_stderr": 0.14083575804390605
},
"hendrycksTest-anatomy": {
"acc": 0.2074074074074074,
"acc_stderr": 0.03502553170678318,
"acc_norm": 0.28888888888888886,
"acc_norm_stderr": 0.0391545063041425
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2037037037037037,
"acc_stderr": 0.02074274056012268,
"acc_norm": 0.21957671957671956,
"acc_norm_stderr": 0.021320018599770375
},
"hendrycksTest-high_school_biology": {
"acc": 0.2,
"acc_stderr": 0.022755204959542936,
"acc_norm": 0.22580645161290322,
"acc_norm_stderr": 0.02378557788418101
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 5.359206989247312,
"likelihood_difference_stderr": 0.7683231947337748,
"pct_stereotype": 0.6021505376344086,
"pct_stereotype_stderr": 0.0510291122856655
},
"hendrycksTest-high_school_statistics": {
"acc": 0.23148148148148148,
"acc_stderr": 0.02876511171804696,
"acc_norm": 0.2962962962962963,
"acc_norm_stderr": 0.031141447823536037
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 10.003472222222221,
"likelihood_difference_stderr": 1.3633059287800664,
"pct_stereotype": 0.4861111111111111,
"pct_stereotype_stderr": 0.059316185327165566
},
"hendrycksTest-formal_logic": {
"acc": 0.2857142857142857,
"acc_stderr": 0.04040610178208841,
"acc_norm": 0.25396825396825395,
"acc_norm_stderr": 0.038932596106046706
},
"hendrycksTest-human_sexuality": {
"acc": 0.25190839694656486,
"acc_stderr": 0.038073871163060866,
"acc_norm": 0.25190839694656486,
"acc_norm_stderr": 0.038073871163060866
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.23798882681564246,
"acc_norm_stderr": 0.014242630070574915
},
"hendrycksTest-abstract_algebra": {
"acc": 0.2,
"acc_stderr": 0.04020151261036845,
"acc_norm": 0.17,
"acc_norm_stderr": 0.03775251680686371
},
"arc_easy": {
"acc": 0.2668350168350168,
"acc_stderr": 0.00907591585926725,
"acc_norm": 0.2638888888888889,
"acc_norm_stderr": 0.009043789220055139
},
"hendrycksTest-college_chemistry": {
"acc": 0.21,
"acc_stderr": 0.04093601807403325,
"acc_norm": 0.26,
"acc_norm_stderr": 0.0440844002276808
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.18067226890756302,
"acc_stderr": 0.02499196496660074,
"acc_norm": 0.2773109243697479,
"acc_norm_stderr": 0.029079374539480007
},
"hendrycksTest-jurisprudence": {
"acc": 0.18518518518518517,
"acc_stderr": 0.03755265865037181,
"acc_norm": 0.24074074074074073,
"acc_norm_stderr": 0.041331194402438376
},
"hendrycksTest-college_physics": {
"acc": 0.13725490196078433,
"acc_stderr": 0.03424084669891521,
"acc_norm": 0.20588235294117646,
"acc_norm_stderr": 0.04023382273617747
},
"hendrycksTest-public_relations": {
"acc": 0.2636363636363636,
"acc_stderr": 0.04220224692971987,
"acc_norm": 0.20909090909090908,
"acc_norm_stderr": 0.03895091015724138
},
"crows_pairs_english_religion": {
"likelihood_difference": 5.728322072072072,
"likelihood_difference_stderr": 0.6965067589462834,
"pct_stereotype": 0.45045045045045046,
"pct_stereotype_stderr": 0.04743846177747609
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2230769230769231,
"acc_stderr": 0.02110773012724399,
"acc_norm": 0.25384615384615383,
"acc_norm_stderr": 0.022066054378726257
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 15.282967032967033,
"likelihood_difference_stderr": 1.0847203102990313,
"pct_stereotype": 0.8021978021978022,
"pct_stereotype_stderr": 0.04198895203196222
},
"hendrycksTest-professional_law": {
"acc": 0.23533246414602346,
"acc_stderr": 0.010834432543912219,
"acc_norm": 0.25684485006518903,
"acc_norm_stderr": 0.011158455853098851
},
"hendrycksTest-high_school_world_history": {
"acc": 0.189873417721519,
"acc_stderr": 0.025530100460233494,
"acc_norm": 0.22362869198312235,
"acc_norm_stderr": 0.02712329820522997
},
"hendrycksTest-business_ethics": {
"acc": 0.26,
"acc_stderr": 0.044084400227680794,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"crows_pairs_french_autre": {
"likelihood_difference": 7.216346153846154,
"likelihood_difference_stderr": 1.9704931663267538,
"pct_stereotype": 0.46153846153846156,
"pct_stereotype_stderr": 0.14390989949130545
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2723404255319149,
"acc_stderr": 0.029101290698386708,
"acc_norm": 0.25957446808510637,
"acc_norm_stderr": 0.02865917937429232
},
"crows_pairs_english_age": {
"likelihood_difference": 3.418269230769231,
"likelihood_difference_stderr": 0.6082631522720632,
"pct_stereotype": 0.5274725274725275,
"pct_stereotype_stderr": 0.05262501097748859
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"arc_challenge": {
"acc": 0.20477815699658702,
"acc_stderr": 0.01179254433851342,
"acc_norm": 0.2440273037542662,
"acc_norm_stderr": 0.01255144762785626
},
"hendrycksTest-high_school_european_history": {
"acc": 0.17575757575757575,
"acc_stderr": 0.02972094300622445,
"acc_norm": 0.22424242424242424,
"acc_norm_stderr": 0.03256866661681102
},
"hendrycksTest-miscellaneous": {
"acc": 0.2503192848020434,
"acc_stderr": 0.015491088951494588,
"acc_norm": 0.25287356321839083,
"acc_norm_stderr": 0.015543377313719681
},
"hendrycksTest-college_biology": {
"acc": 0.2569444444444444,
"acc_stderr": 0.03653946969442099,
"acc_norm": 0.25,
"acc_norm_stderr": 0.03621034121889507
},
"crows_pairs_english_nationality": {
"likelihood_difference": 6.14380787037037,
"likelihood_difference_stderr": 0.5217915071777064,
"pct_stereotype": 0.37037037037037035,
"pct_stereotype_stderr": 0.03293377139415191
},
"crows_pairs_french_gender": {
"likelihood_difference": 7.492017133956386,
"likelihood_difference_stderr": 0.4566662635366117,
"pct_stereotype": 0.48286604361370716,
"pct_stereotype_stderr": 0.027934433698537306
},
"piqa": {
"acc": 0.5272034820457019,
"acc_stderr": 0.011648545262429021,
"acc_norm": 0.5261153427638737,
"acc_norm_stderr": 0.011649900854263415
},
"hendrycksTest-security_studies": {
"acc": 0.31020408163265306,
"acc_stderr": 0.02961345987248438,
"acc_norm": 0.19183673469387755,
"acc_norm_stderr": 0.025206963154225378
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.19170984455958548,
"acc_stderr": 0.02840895362624527,
"acc_norm": 0.24870466321243523,
"acc_norm_stderr": 0.03119584087770028
},
"crows_pairs_french_disability": {
"likelihood_difference": 14.775568181818182,
"likelihood_difference_stderr": 1.4715579883772572,
"pct_stereotype": 0.3939393939393939,
"pct_stereotype_stderr": 0.06060606060606062
},
"hendrycksTest-college_computer_science": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.17,
"acc_norm_stderr": 0.03775251680686371
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.2074074074074074,
"acc_stderr": 0.02472071319395215,
"acc_norm": 0.26666666666666666,
"acc_norm_stderr": 0.026962424325073824
},
"hendrycksTest-human_aging": {
"acc": 0.273542600896861,
"acc_stderr": 0.029918586707798824,
"acc_norm": 0.27802690582959644,
"acc_norm_stderr": 0.030069584874494033
}
},
"versions": {
"lambada_openai": 0,
"hendrycksTest-astronomy": 0,
"winogrande": 0,
"hendrycksTest-high_school_us_history": 0,
"crows_pairs_french_age": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_physics": 0,
"crows_pairs_french_socioeconomic": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-moral_disputes": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-college_mathematics": 0,
"crows_pairs_french": 0,
"wsc": 0,
"hendrycksTest-electrical_engineering": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-management": 0,
"hendrycksTest-machine_learning": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-high_school_geography": 0,
"sciq": 0,
"crows_pairs_french_religion": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-professional_accounting": 0,
"logiqa": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-philosophy": 0,
"crows_pairs_french_race_color": 0,
"hendrycksTest-clinical_knowledge": 0,
"crows_pairs_english": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-college_medicine": 0,
"crows_pairs_english_socioeconomic": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-high_school_biology": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-high_school_statistics": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-abstract_algebra": 0,
"arc_easy": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-public_relations": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-business_ethics": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-conceptual_physics": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-us_foreign_policy": 0,
"arc_challenge": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-college_biology": 0,
"crows_pairs_english_nationality": 0,
"crows_pairs_french_gender": 0,
"piqa": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-human_aging": 0
},
"config": {
"model": "hf-causal",
"model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step32",
"num_fewshot": 0,
"batch_size": 32,
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}