karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
22 kB
{
"results": {
"hendrycksTest-elementary_mathematics": {
"acc": 0.21693121693121692,
"acc_stderr": 0.021227082449445045,
"acc_norm": 0.21164021164021163,
"acc_norm_stderr": 0.02103733150526289
},
"hendrycksTest-high_school_world_history": {
"acc": 0.1729957805907173,
"acc_stderr": 0.024621562866768427,
"acc_norm": 0.25316455696202533,
"acc_norm_stderr": 0.028304657943035296
},
"winogrande": {
"acc": 0.4972375690607735,
"acc_stderr": 0.014052271211616445
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.21794871794871795,
"acc_stderr": 0.020932445774463175,
"acc_norm": 0.26153846153846155,
"acc_norm_stderr": 0.02228214120420442
},
"hendrycksTest-machine_learning": {
"acc": 0.24107142857142858,
"acc_stderr": 0.04059867246952685,
"acc_norm": 0.2857142857142857,
"acc_norm_stderr": 0.04287858751340456
},
"hendrycksTest-college_physics": {
"acc": 0.22549019607843138,
"acc_stderr": 0.041583075330832865,
"acc_norm": 0.23529411764705882,
"acc_norm_stderr": 0.04220773659171452
},
"hendrycksTest-anatomy": {
"acc": 0.23703703703703705,
"acc_stderr": 0.03673731683969506,
"acc_norm": 0.28888888888888886,
"acc_norm_stderr": 0.0391545063041425
},
"hendrycksTest-management": {
"acc": 0.23300970873786409,
"acc_stderr": 0.04185832598928315,
"acc_norm": 0.23300970873786409,
"acc_norm_stderr": 0.04185832598928315
},
"hendrycksTest-college_medicine": {
"acc": 0.21965317919075145,
"acc_stderr": 0.031568093627031744,
"acc_norm": 0.2658959537572254,
"acc_norm_stderr": 0.03368762932259431
},
"hendrycksTest-conceptual_physics": {
"acc": 0.28936170212765955,
"acc_stderr": 0.029644006577009618,
"acc_norm": 0.24680851063829787,
"acc_norm_stderr": 0.02818544130123408
},
"hendrycksTest-professional_psychology": {
"acc": 0.2222222222222222,
"acc_stderr": 0.01681902837573638,
"acc_norm": 0.24836601307189543,
"acc_norm_stderr": 0.017479487001364764
},
"hendrycksTest-high_school_geography": {
"acc": 0.21717171717171718,
"acc_stderr": 0.029376616484945637,
"acc_norm": 0.26262626262626265,
"acc_norm_stderr": 0.031353050095330855
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 5.821180555555555,
"likelihood_difference_stderr": 0.7528994326490429,
"pct_stereotype": 0.5416666666666666,
"pct_stereotype_stderr": 0.05913268547421811
},
"crows_pairs_english_disability": {
"likelihood_difference": 7.460576923076923,
"likelihood_difference_stderr": 1.1593364221878786,
"pct_stereotype": 0.5076923076923077,
"pct_stereotype_stderr": 0.062492603112584276
},
"hendrycksTest-logical_fallacies": {
"acc": 0.17177914110429449,
"acc_stderr": 0.02963471727237102,
"acc_norm": 0.27607361963190186,
"acc_norm_stderr": 0.0351238528370505
},
"hendrycksTest-security_studies": {
"acc": 0.3183673469387755,
"acc_stderr": 0.02982253379398205,
"acc_norm": 0.21224489795918366,
"acc_norm_stderr": 0.026176967197866767
},
"hendrycksTest-virology": {
"acc": 0.16265060240963855,
"acc_stderr": 0.028730237892613798,
"acc_norm": 0.21084337349397592,
"acc_norm_stderr": 0.031755547866299215
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.22,
"acc_norm_stderr": 0.0416333199893227
},
"hendrycksTest-econometrics": {
"acc": 0.22807017543859648,
"acc_stderr": 0.03947152782669415,
"acc_norm": 0.32456140350877194,
"acc_norm_stderr": 0.04404556157374768
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.21243523316062177,
"acc_stderr": 0.029519282616817254,
"acc_norm": 0.24870466321243523,
"acc_norm_stderr": 0.03119584087770029
},
"logiqa": {
"acc": 0.20890937019969277,
"acc_stderr": 0.015945399396423896,
"acc_norm": 0.24270353302611367,
"acc_norm_stderr": 0.016815676206479523
},
"hendrycksTest-human_sexuality": {
"acc": 0.2366412213740458,
"acc_stderr": 0.03727673575596919,
"acc_norm": 0.25190839694656486,
"acc_norm_stderr": 0.038073871163060866
},
"crows_pairs_french_autre": {
"likelihood_difference": 6.8173076923076925,
"likelihood_difference_stderr": 1.7019864539843879,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.14390989949130545
},
"crows_pairs_french_disability": {
"likelihood_difference": 14.00189393939394,
"likelihood_difference_stderr": 1.4054078154752692,
"pct_stereotype": 0.36363636363636365,
"pct_stereotype_stderr": 0.05966637484671757
},
"hendrycksTest-high_school_psychology": {
"acc": 0.23486238532110093,
"acc_stderr": 0.01817511051034359,
"acc_norm": 0.24954128440366974,
"acc_norm_stderr": 0.018553897629501624
},
"sciq": {
"acc": 0.199,
"acc_stderr": 0.012631649083099186,
"acc_norm": 0.218,
"acc_norm_stderr": 0.013063179040595282
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 11.76251594387755,
"likelihood_difference_stderr": 0.7977323948668976,
"pct_stereotype": 0.3673469387755102,
"pct_stereotype_stderr": 0.03452261728704165
},
"crows_pairs_french_religion": {
"likelihood_difference": 11.039673913043478,
"likelihood_difference_stderr": 0.8746581584196995,
"pct_stereotype": 0.6608695652173913,
"pct_stereotype_stderr": 0.04433930011819816
},
"hendrycksTest-jurisprudence": {
"acc": 0.18518518518518517,
"acc_stderr": 0.03755265865037181,
"acc_norm": 0.24074074074074073,
"acc_norm_stderr": 0.041331194402438376
},
"crows_pairs_french_gender": {
"likelihood_difference": 7.126947040498442,
"likelihood_difference_stderr": 0.41934707639834146,
"pct_stereotype": 0.48598130841121495,
"pct_stereotype_stderr": 0.027939861549302374
},
"hendrycksTest-college_mathematics": {
"acc": 0.13,
"acc_stderr": 0.03379976689896309,
"acc_norm": 0.21,
"acc_norm_stderr": 0.040936018074033256
},
"arc_challenge": {
"acc": 0.20819112627986347,
"acc_stderr": 0.01186486611844807,
"acc_norm": 0.24488054607508533,
"acc_norm_stderr": 0.012566273985131356
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.1660377358490566,
"acc_stderr": 0.022902064724569966,
"acc_norm": 0.3018867924528302,
"acc_norm_stderr": 0.028254200344438655
},
"crows_pairs_english_autre": {
"likelihood_difference": 5.184659090909091,
"likelihood_difference_stderr": 2.7270102264769593,
"pct_stereotype": 0.6363636363636364,
"pct_stereotype_stderr": 0.15212000482437738
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 5.49078947368421,
"likelihood_difference_stderr": 0.48194076993788276,
"pct_stereotype": 0.6157894736842106,
"pct_stereotype_stderr": 0.03538097998767891
},
"hendrycksTest-medical_genetics": {
"acc": 0.28,
"acc_stderr": 0.04512608598542128,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909283
},
"crows_pairs_english_race_color": {
"likelihood_difference": 5.203678641732283,
"likelihood_difference_stderr": 0.3157259260594444,
"pct_stereotype": 0.33267716535433073,
"pct_stereotype_stderr": 0.02092548388333584
},
"hendrycksTest-human_aging": {
"acc": 0.2645739910313901,
"acc_stderr": 0.029605103217038315,
"acc_norm": 0.2825112107623318,
"acc_norm_stderr": 0.03021683101150877
},
"crows_pairs_french_nationality": {
"likelihood_difference": 8.90736166007905,
"likelihood_difference_stderr": 0.48994737960034646,
"pct_stereotype": 0.5177865612648221,
"pct_stereotype_stderr": 0.03147710419094347
},
"hendrycksTest-high_school_statistics": {
"acc": 0.2222222222222222,
"acc_stderr": 0.028353212866863445,
"acc_norm": 0.2824074074074074,
"acc_norm_stderr": 0.030701372111510937
},
"hendrycksTest-philosophy": {
"acc": 0.2508038585209003,
"acc_stderr": 0.024619771956697168,
"acc_norm": 0.2765273311897106,
"acc_norm_stderr": 0.02540383297817961
},
"hendrycksTest-miscellaneous": {
"acc": 0.24265644955300128,
"acc_stderr": 0.01532988894089986,
"acc_norm": 0.26053639846743293,
"acc_norm_stderr": 0.01569600856380707
},
"hendrycksTest-professional_law": {
"acc": 0.2242503259452412,
"acc_stderr": 0.010652615824906163,
"acc_norm": 0.2529335071707953,
"acc_norm_stderr": 0.011102268713839989
},
"hendrycksTest-astronomy": {
"acc": 0.19736842105263158,
"acc_stderr": 0.03238981601699397,
"acc_norm": 0.24342105263157895,
"acc_norm_stderr": 0.034923496688842384
},
"crows_pairs_english_nationality": {
"likelihood_difference": 5.985243055555555,
"likelihood_difference_stderr": 0.48650219522921606,
"pct_stereotype": 0.3333333333333333,
"pct_stereotype_stderr": 0.03214952147802749
},
"hendrycksTest-college_chemistry": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.27,
"acc_norm_stderr": 0.0446196043338474
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.22660098522167488,
"acc_stderr": 0.029454863835292968,
"acc_norm": 0.21674876847290642,
"acc_norm_stderr": 0.028990331252516235
},
"hendrycksTest-high_school_european_history": {
"acc": 0.17575757575757575,
"acc_stderr": 0.02972094300622445,
"acc_norm": 0.23030303030303031,
"acc_norm_stderr": 0.03287666758603488
},
"hendrycksTest-high_school_us_history": {
"acc": 0.19117647058823528,
"acc_stderr": 0.027599174300640766,
"acc_norm": 0.2696078431372549,
"acc_norm_stderr": 0.031145570659486782
},
"hendrycksTest-college_computer_science": {
"acc": 0.23,
"acc_stderr": 0.042295258468165044,
"acc_norm": 0.22,
"acc_norm_stderr": 0.04163331998932269
},
"crows_pairs_french_race_color": {
"likelihood_difference": 8.887228260869565,
"likelihood_difference_stderr": 0.3298429589043161,
"pct_stereotype": 0.717391304347826,
"pct_stereotype_stderr": 0.02101669741793868
},
"crows_pairs_french": {
"likelihood_difference": 9.346535852713178,
"likelihood_difference_stderr": 0.21437404785240546,
"pct_stereotype": 0.5742397137745975,
"pct_stereotype_stderr": 0.012077920863042001
},
"hendrycksTest-world_religions": {
"acc": 0.17543859649122806,
"acc_stderr": 0.02917088550072768,
"acc_norm": 0.2222222222222222,
"acc_norm_stderr": 0.03188578017686399
},
"hendrycksTest-global_facts": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.22,
"acc_norm_stderr": 0.04163331998932269
},
"hendrycksTest-abstract_algebra": {
"acc": 0.2,
"acc_stderr": 0.04020151261036843,
"acc_norm": 0.15,
"acc_norm_stderr": 0.035887028128263734
},
"hendrycksTest-business_ethics": {
"acc": 0.32,
"acc_stderr": 0.04688261722621504,
"acc_norm": 0.27,
"acc_norm_stderr": 0.0446196043338474
},
"hendrycksTest-international_law": {
"acc": 0.09917355371900827,
"acc_stderr": 0.027285246312758957,
"acc_norm": 0.2809917355371901,
"acc_norm_stderr": 0.04103203830514512
},
"hendrycksTest-college_biology": {
"acc": 0.2013888888888889,
"acc_stderr": 0.03353647469713839,
"acc_norm": 0.2361111111111111,
"acc_norm_stderr": 0.03551446610810826
},
"wsc": {
"acc": 0.375,
"acc_stderr": 0.04770204856076104
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.18067226890756302,
"acc_stderr": 0.024991964966600753,
"acc_norm": 0.2857142857142857,
"acc_norm_stderr": 0.029344572500634342
},
"hendrycksTest-prehistory": {
"acc": 0.2623456790123457,
"acc_stderr": 0.02447722285613512,
"acc_norm": 0.23765432098765432,
"acc_norm_stderr": 0.023683591837008557
},
"hendrycksTest-professional_accounting": {
"acc": 0.2695035460992908,
"acc_stderr": 0.02646903681859063,
"acc_norm": 0.2801418439716312,
"acc_norm_stderr": 0.02678917235114024
},
"piqa": {
"acc": 0.5402611534276387,
"acc_stderr": 0.011627942981817168,
"acc_norm": 0.5195865070729053,
"acc_norm_stderr": 0.011656869979288454
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-computer_security": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.29,
"acc_norm_stderr": 0.045604802157206845
},
"hendrycksTest-high_school_biology": {
"acc": 0.19032258064516128,
"acc_stderr": 0.02233170761182307,
"acc_norm": 0.24516129032258063,
"acc_norm_stderr": 0.02447224384089552
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.1814814814814815,
"acc_stderr": 0.023499264669407306,
"acc_norm": 0.27037037037037037,
"acc_norm_stderr": 0.027080372815145658
},
"crows_pairs_english": {
"likelihood_difference": 5.231738223017293,
"likelihood_difference_stderr": 0.1788777548247783,
"pct_stereotype": 0.43530113297555156,
"pct_stereotype_stderr": 0.012110619233278561
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 5.415994623655914,
"likelihood_difference_stderr": 0.7173989239033504,
"pct_stereotype": 0.45161290322580644,
"pct_stereotype_stderr": 0.051883930752016603
},
"hendrycksTest-nutrition": {
"acc": 0.20261437908496732,
"acc_stderr": 0.023015446877985693,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.02526169121972948
},
"hendrycksTest-sociology": {
"acc": 0.2885572139303483,
"acc_stderr": 0.0320384104021332,
"acc_norm": 0.31343283582089554,
"acc_norm_stderr": 0.03280188205348642
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.22,
"acc_stderr": 0.041633319989322695,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"crows_pairs_english_age": {
"likelihood_difference": 3.3361950549450547,
"likelihood_difference_stderr": 0.5594307162046473,
"pct_stereotype": 0.45054945054945056,
"pct_stereotype_stderr": 0.052446231001012276
},
"hendrycksTest-high_school_physics": {
"acc": 0.1986754966887417,
"acc_stderr": 0.032578473844367774,
"acc_norm": 0.2781456953642384,
"acc_norm_stderr": 0.036586032627637426
},
"hendrycksTest-public_relations": {
"acc": 0.3,
"acc_stderr": 0.04389311454644287,
"acc_norm": 0.17272727272727273,
"acc_norm_stderr": 0.03620691833929219
},
"arc_easy": {
"acc": 0.2706228956228956,
"acc_stderr": 0.009116466166403832,
"acc_norm": 0.26515151515151514,
"acc_norm_stderr": 0.009057621139172618
},
"hendrycksTest-moral_disputes": {
"acc": 0.21676300578034682,
"acc_stderr": 0.022183477668412853,
"acc_norm": 0.20520231213872833,
"acc_norm_stderr": 0.021742519835276277
},
"hendrycksTest-marketing": {
"acc": 0.19230769230769232,
"acc_stderr": 0.025819233256483713,
"acc_norm": 0.24358974358974358,
"acc_norm_stderr": 0.0281209665039144
},
"hendrycksTest-electrical_engineering": {
"acc": 0.21379310344827587,
"acc_stderr": 0.03416520447747549,
"acc_norm": 0.20689655172413793,
"acc_norm_stderr": 0.03375672449560554
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 9.36111111111111,
"likelihood_difference_stderr": 1.299896180819777,
"pct_stereotype": 0.5833333333333334,
"pct_stereotype_stderr": 0.05850912479161746
},
"hendrycksTest-formal_logic": {
"acc": 0.29365079365079366,
"acc_stderr": 0.040735243221471276,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.04006168083848876
},
"crows_pairs_english_religion": {
"likelihood_difference": 5.315596846846847,
"likelihood_difference_stderr": 0.6585436977642254,
"pct_stereotype": 0.40540540540540543,
"pct_stereotype_stderr": 0.046812183988348
},
"hendrycksTest-professional_medicine": {
"acc": 0.2426470588235294,
"acc_stderr": 0.026040662474201264,
"acc_norm": 0.28308823529411764,
"acc_norm_stderr": 0.02736586113151381
},
"lambada_openai": {
"ppl": 2347965.083490206,
"ppl_stderr": 208687.88666648002,
"acc": 0.0,
"acc_stderr": 0.0
},
"crows_pairs_english_gender": {
"likelihood_difference": 4.4865234375,
"likelihood_difference_stderr": 0.46807997363511794,
"pct_stereotype": 0.515625,
"pct_stereotype_stderr": 0.027980952958187033
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 14.45467032967033,
"likelihood_difference_stderr": 1.0156884564726079,
"pct_stereotype": 0.8021978021978022,
"pct_stereotype_stderr": 0.04198895203196222
},
"crows_pairs_french_age": {
"likelihood_difference": 5.195138888888889,
"likelihood_difference_stderr": 0.6908977353059731,
"pct_stereotype": 0.5777777777777777,
"pct_stereotype_stderr": 0.05235473399540658
}
},
"versions": {
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-high_school_world_history": 0,
"winogrande": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-management": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-high_school_geography": 0,
"crows_pairs_english_physical_appearance": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"logiqa": 0,
"hendrycksTest-human_sexuality": 0,
"crows_pairs_french_autre": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-high_school_psychology": 0,
"sciq": 0,
"crows_pairs_french_socioeconomic": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-jurisprudence": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-college_mathematics": 0,
"arc_challenge": 0,
"hendrycksTest-clinical_knowledge": 0,
"crows_pairs_english_autre": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-medical_genetics": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-human_aging": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-astronomy": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-college_computer_science": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_french": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-college_biology": 0,
"wsc": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_accounting": 0,
"piqa": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_mathematics": 0,
"crows_pairs_english": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-high_school_computer_science": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-public_relations": 0,
"arc_easy": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-electrical_engineering": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-formal_logic": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-professional_medicine": 0,
"lambada_openai": 0,
"crows_pairs_english_gender": 0,
"crows_pairs_french_sexual_orientation": 0,
"crows_pairs_french_age": 0
},
"config": {
"model": "hf-causal",
"model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step64",
"num_fewshot": 0,
"batch_size": 32,
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}