Spaces:
Running
Running
| { | |
| "rows": [ | |
| { | |
| "model": "meta-llama/llama-3.1-8b-instruct", | |
| "price_prompt_per_token": 2e-08, | |
| "price_completion_per_token": 5e-08, | |
| "price_total_per_token": 6.999999999999999e-08, | |
| "price_ratio_vs_baseline": 1.0, | |
| "json_valid_rate_selection": 1.0, | |
| "json_parse_fail_rate_selection": 0.0, | |
| "selection_attempt_error_rate": 0.0, | |
| "selection_call_exhaust_rate": 0.06451612903225806, | |
| "macro_precision": 0.48757333333333336, | |
| "macro_recall": 0.68523, | |
| "macro_f1": 0.5531033333333333, | |
| "micro_precision": 0.4859038142620232, | |
| "micro_recall": 0.6845794392523364, | |
| "micro_f1": 0.5683802133850631, | |
| "hallucinated_avg_extra_tags": 10.333333333333334, | |
| "hallucinated_rate_fp_over_selected": 0.5140961857379768, | |
| "latency_avg_total_s": 16.39666666666667, | |
| "latency_p95_total_s": 28.11, | |
| "weird_spurious_character_samples": 1, | |
| "weird_rare_extra_ratio": 0.22580645161290322, | |
| "top_extra_tags": [ | |
| [ | |
| "clothed", | |
| 18 | |
| ], | |
| [ | |
| "clothing", | |
| 15 | |
| ], | |
| [ | |
| "anthro", | |
| 12 | |
| ], | |
| [ | |
| "looking_at_viewer", | |
| 11 | |
| ], | |
| [ | |
| "eyes", | |
| 8 | |
| ], | |
| [ | |
| "male", | |
| 5 | |
| ], | |
| [ | |
| "text", | |
| 5 | |
| ], | |
| [ | |
| "bottomwear", | |
| 3 | |
| ], | |
| [ | |
| "holding_object", | |
| 3 | |
| ], | |
| [ | |
| "nose", | |
| 3 | |
| ], | |
| [ | |
| "pose", | |
| 3 | |
| ], | |
| [ | |
| "topless", | |
| 3 | |
| ] | |
| ], | |
| "n_errors": 0 | |
| }, | |
| { | |
| "model": "mistralai/mistral-nemo", | |
| "price_prompt_per_token": 2e-08, | |
| "price_completion_per_token": 4e-08, | |
| "price_total_per_token": 6.000000000000001e-08, | |
| "price_ratio_vs_baseline": 0.8571428571428573, | |
| "json_valid_rate_selection": 1.0, | |
| "json_parse_fail_rate_selection": 0.0, | |
| "selection_attempt_error_rate": 0.0, | |
| "selection_call_exhaust_rate": 0.22580645161290322, | |
| "macro_precision": 0.5021633333333333, | |
| "macro_recall": 0.4754366666666667, | |
| "macro_f1": 0.4667133333333333, | |
| "micro_precision": 0.5153846153846153, | |
| "micro_recall": 0.4696261682242991, | |
| "micro_f1": 0.49144254278728605, | |
| "hallucinated_avg_extra_tags": 6.3, | |
| "hallucinated_rate_fp_over_selected": 0.4846153846153846, | |
| "latency_avg_total_s": 16.548666666666666, | |
| "latency_p95_total_s": 26.270000000000003, | |
| "weird_spurious_character_samples": 3, | |
| "weird_rare_extra_ratio": 0.13227513227513227, | |
| "top_extra_tags": [ | |
| [ | |
| "clothed", | |
| 21 | |
| ], | |
| [ | |
| "clothing", | |
| 17 | |
| ], | |
| [ | |
| "anthro", | |
| 14 | |
| ], | |
| [ | |
| "looking_at_viewer", | |
| 11 | |
| ], | |
| [ | |
| "female", | |
| 8 | |
| ], | |
| [ | |
| "male", | |
| 6 | |
| ], | |
| [ | |
| "pose", | |
| 5 | |
| ], | |
| [ | |
| "text", | |
| 5 | |
| ], | |
| [ | |
| "playful", | |
| 3 | |
| ], | |
| [ | |
| "ambiguous_gender", | |
| 2 | |
| ], | |
| [ | |
| "eyes", | |
| 2 | |
| ], | |
| [ | |
| "humanoid", | |
| 2 | |
| ] | |
| ], | |
| "n_errors": 0 | |
| }, | |
| { | |
| "model": "qwen/qwen-2.5-7b-instruct", | |
| "price_prompt_per_token": 4e-08, | |
| "price_completion_per_token": 1e-07, | |
| "price_total_per_token": 1.3999999999999998e-07, | |
| "price_ratio_vs_baseline": 2.0, | |
| "json_valid_rate_selection": 1.0, | |
| "json_parse_fail_rate_selection": 0.0, | |
| "selection_attempt_error_rate": 0.09090909090909091, | |
| "selection_call_exhaust_rate": 0.03225806451612903, | |
| "macro_precision": 0.5090266666666666, | |
| "macro_recall": 0.5072733333333334, | |
| "macro_f1": 0.4856666666666667, | |
| "micro_precision": 0.5071090047393365, | |
| "micro_recall": 0.5, | |
| "micro_f1": 0.5035294117647059, | |
| "hallucinated_avg_extra_tags": 6.933333333333334, | |
| "hallucinated_rate_fp_over_selected": 0.4928909952606635, | |
| "latency_avg_total_s": 18.443333333333335, | |
| "latency_p95_total_s": 33.75, | |
| "weird_spurious_character_samples": 1, | |
| "weird_rare_extra_ratio": 0.22596153846153846, | |
| "top_extra_tags": [ | |
| [ | |
| "looking_at_viewer", | |
| 16 | |
| ], | |
| [ | |
| "clothed", | |
| 12 | |
| ], | |
| [ | |
| "anthro", | |
| 9 | |
| ], | |
| [ | |
| "clothing", | |
| 9 | |
| ], | |
| [ | |
| "text", | |
| 6 | |
| ], | |
| [ | |
| "eyes", | |
| 3 | |
| ], | |
| [ | |
| "feral", | |
| 3 | |
| ], | |
| [ | |
| "playful", | |
| 3 | |
| ], | |
| [ | |
| "pose", | |
| 3 | |
| ], | |
| [ | |
| "black_clothing", | |
| 2 | |
| ], | |
| [ | |
| "dark_background", | |
| 2 | |
| ], | |
| [ | |
| "excited", | |
| 2 | |
| ] | |
| ], | |
| "n_errors": 0 | |
| }, | |
| { | |
| "model": "qwen/qwen2.5-coder-7b-instruct", | |
| "price_prompt_per_token": 3e-08, | |
| "price_completion_per_token": 9e-08, | |
| "price_total_per_token": 1.2e-07, | |
| "price_ratio_vs_baseline": 1.7142857142857144, | |
| "json_valid_rate_selection": 1.0, | |
| "json_parse_fail_rate_selection": 0.0, | |
| "selection_attempt_error_rate": 0.0, | |
| "selection_call_exhaust_rate": 0.0, | |
| "macro_precision": 0.3461066666666667, | |
| "macro_recall": 0.5469366666666666, | |
| "macro_f1": 0.40475666666666665, | |
| "micro_precision": 0.3219178082191781, | |
| "micro_recall": 0.5490654205607477, | |
| "micro_f1": 0.4058721934369603, | |
| "hallucinated_avg_extra_tags": 16.5, | |
| "hallucinated_rate_fp_over_selected": 0.678082191780822, | |
| "latency_avg_total_s": 14.717333333333332, | |
| "latency_p95_total_s": 24.189999999999998, | |
| "weird_spurious_character_samples": 10, | |
| "weird_rare_extra_ratio": 0.2080808080808081, | |
| "top_extra_tags": [ | |
| [ | |
| "male", | |
| 23 | |
| ], | |
| [ | |
| "clothed", | |
| 22 | |
| ], | |
| [ | |
| "clothing", | |
| 18 | |
| ], | |
| [ | |
| "anthro", | |
| 14 | |
| ], | |
| [ | |
| "eyes", | |
| 8 | |
| ], | |
| [ | |
| "looking_at_viewer", | |
| 8 | |
| ], | |
| [ | |
| "duo", | |
| 7 | |
| ], | |
| [ | |
| "pose", | |
| 6 | |
| ], | |
| [ | |
| "canis", | |
| 5 | |
| ], | |
| [ | |
| "holding_object", | |
| 5 | |
| ], | |
| [ | |
| "wolf", | |
| 5 | |
| ], | |
| [ | |
| "background_character", | |
| 4 | |
| ] | |
| ], | |
| "n_errors": 0 | |
| }, | |
| { | |
| "model": "mistralai/mistral-small-3.1-24b-instruct", | |
| "price_prompt_per_token": 3e-08, | |
| "price_completion_per_token": 1.1e-07, | |
| "price_total_per_token": 1.4e-07, | |
| "price_ratio_vs_baseline": 2.0000000000000004, | |
| "json_valid_rate_selection": 1.0, | |
| "json_parse_fail_rate_selection": 0.0, | |
| "selection_attempt_error_rate": 0.0, | |
| "selection_call_exhaust_rate": 0.0, | |
| "macro_precision": 0.5346866666666666, | |
| "macro_recall": 0.80151, | |
| "macro_f1": 0.62949, | |
| "micro_precision": 0.5291411042944786, | |
| "micro_recall": 0.8060747663551402, | |
| "micro_f1": 0.638888888888889, | |
| "hallucinated_avg_extra_tags": 10.233333333333333, | |
| "hallucinated_rate_fp_over_selected": 0.4708588957055215, | |
| "latency_avg_total_s": 21.778666666666666, | |
| "latency_p95_total_s": 31.800000000000004, | |
| "weird_spurious_character_samples": 0, | |
| "weird_rare_extra_ratio": 0.2280130293159609, | |
| "top_extra_tags": [ | |
| [ | |
| "anthro", | |
| 11 | |
| ], | |
| [ | |
| "clothed", | |
| 11 | |
| ], | |
| [ | |
| "eyes", | |
| 10 | |
| ], | |
| [ | |
| "pose", | |
| 8 | |
| ], | |
| [ | |
| "ambiguous_gender", | |
| 7 | |
| ], | |
| [ | |
| "clothing", | |
| 7 | |
| ], | |
| [ | |
| "text", | |
| 5 | |
| ], | |
| [ | |
| "holding_object", | |
| 4 | |
| ], | |
| [ | |
| "humanoid", | |
| 4 | |
| ], | |
| [ | |
| "playful", | |
| 4 | |
| ], | |
| [ | |
| "action_pose", | |
| 3 | |
| ], | |
| [ | |
| "feral", | |
| 3 | |
| ] | |
| ], | |
| "n_errors": 0 | |
| } | |
| ], | |
| "important_tags": [ | |
| "anthro", | |
| "clothed", | |
| "clothing", | |
| "canid", | |
| "felid", | |
| "bear", | |
| "looking_at_viewer", | |
| "solo" | |
| ], | |
| "per_tag": { | |
| "meta-llama/llama-3.1-8b-instruct": { | |
| "anthro": { | |
| "support_gt": 16, | |
| "predicted": 28, | |
| "tp": 16, | |
| "fp": 12, | |
| "fn": 0, | |
| "precision": 0.5714285714285714, | |
| "recall": 1.0, | |
| "f1": 0.7272727272727273 | |
| }, | |
| "clothed": { | |
| "support_gt": 8, | |
| "predicted": 26, | |
| "tp": 8, | |
| "fp": 18, | |
| "fn": 0, | |
| "precision": 0.3076923076923077, | |
| "recall": 1.0, | |
| "f1": 0.47058823529411764 | |
| }, | |
| "clothing": { | |
| "support_gt": 12, | |
| "predicted": 27, | |
| "tp": 12, | |
| "fp": 15, | |
| "fn": 0, | |
| "precision": 0.4444444444444444, | |
| "recall": 1.0, | |
| "f1": 0.6153846153846153 | |
| }, | |
| "canid": { | |
| "support_gt": 9, | |
| "predicted": 9, | |
| "tp": 9, | |
| "fp": 0, | |
| "fn": 0, | |
| "precision": 1.0, | |
| "recall": 1.0, | |
| "f1": 1.0 | |
| }, | |
| "felid": { | |
| "support_gt": 2, | |
| "predicted": 2, | |
| "tp": 2, | |
| "fp": 0, | |
| "fn": 0, | |
| "precision": 1.0, | |
| "recall": 1.0, | |
| "f1": 1.0 | |
| }, | |
| "bear": { | |
| "support_gt": 3, | |
| "predicted": 1, | |
| "tp": 1, | |
| "fp": 0, | |
| "fn": 2, | |
| "precision": 1.0, | |
| "recall": 0.3333333333333333, | |
| "f1": 0.5 | |
| }, | |
| "looking_at_viewer": { | |
| "support_gt": 2, | |
| "predicted": 13, | |
| "tp": 2, | |
| "fp": 11, | |
| "fn": 0, | |
| "precision": 0.15384615384615385, | |
| "recall": 1.0, | |
| "f1": 0.2666666666666667 | |
| }, | |
| "solo": { | |
| "support_gt": 23, | |
| "predicted": 25, | |
| "tp": 23, | |
| "fp": 2, | |
| "fn": 0, | |
| "precision": 0.92, | |
| "recall": 1.0, | |
| "f1": 0.9583333333333334 | |
| } | |
| }, | |
| "mistralai/mistral-nemo": { | |
| "anthro": { | |
| "support_gt": 16, | |
| "predicted": 30, | |
| "tp": 16, | |
| "fp": 14, | |
| "fn": 0, | |
| "precision": 0.5333333333333333, | |
| "recall": 1.0, | |
| "f1": 0.6956521739130436 | |
| }, | |
| "clothed": { | |
| "support_gt": 8, | |
| "predicted": 29, | |
| "tp": 8, | |
| "fp": 21, | |
| "fn": 0, | |
| "precision": 0.27586206896551724, | |
| "recall": 1.0, | |
| "f1": 0.4324324324324324 | |
| }, | |
| "clothing": { | |
| "support_gt": 12, | |
| "predicted": 29, | |
| "tp": 12, | |
| "fp": 17, | |
| "fn": 0, | |
| "precision": 0.41379310344827586, | |
| "recall": 1.0, | |
| "f1": 0.5853658536585366 | |
| }, | |
| "canid": { | |
| "support_gt": 9, | |
| "predicted": 5, | |
| "tp": 4, | |
| "fp": 1, | |
| "fn": 5, | |
| "precision": 0.8, | |
| "recall": 0.4444444444444444, | |
| "f1": 0.5714285714285714 | |
| }, | |
| "felid": { | |
| "support_gt": 2, | |
| "predicted": 1, | |
| "tp": 1, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.5, | |
| "f1": 0.6666666666666666 | |
| }, | |
| "bear": { | |
| "support_gt": 3, | |
| "predicted": 2, | |
| "tp": 2, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.6666666666666666, | |
| "f1": 0.8 | |
| }, | |
| "looking_at_viewer": { | |
| "support_gt": 2, | |
| "predicted": 13, | |
| "tp": 2, | |
| "fp": 11, | |
| "fn": 0, | |
| "precision": 0.15384615384615385, | |
| "recall": 1.0, | |
| "f1": 0.2666666666666667 | |
| }, | |
| "solo": { | |
| "support_gt": 23, | |
| "predicted": 25, | |
| "tp": 23, | |
| "fp": 2, | |
| "fn": 0, | |
| "precision": 0.92, | |
| "recall": 1.0, | |
| "f1": 0.9583333333333334 | |
| } | |
| }, | |
| "qwen/qwen-2.5-7b-instruct": { | |
| "anthro": { | |
| "support_gt": 16, | |
| "predicted": 25, | |
| "tp": 16, | |
| "fp": 9, | |
| "fn": 0, | |
| "precision": 0.64, | |
| "recall": 1.0, | |
| "f1": 0.7804878048780487 | |
| }, | |
| "clothed": { | |
| "support_gt": 8, | |
| "predicted": 19, | |
| "tp": 7, | |
| "fp": 12, | |
| "fn": 1, | |
| "precision": 0.3684210526315789, | |
| "recall": 0.875, | |
| "f1": 0.5185185185185185 | |
| }, | |
| "clothing": { | |
| "support_gt": 12, | |
| "predicted": 21, | |
| "tp": 12, | |
| "fp": 9, | |
| "fn": 0, | |
| "precision": 0.5714285714285714, | |
| "recall": 1.0, | |
| "f1": 0.7272727272727273 | |
| }, | |
| "canid": { | |
| "support_gt": 9, | |
| "predicted": 6, | |
| "tp": 5, | |
| "fp": 1, | |
| "fn": 4, | |
| "precision": 0.8333333333333334, | |
| "recall": 0.5555555555555556, | |
| "f1": 0.6666666666666667 | |
| }, | |
| "felid": { | |
| "support_gt": 2, | |
| "predicted": 3, | |
| "tp": 2, | |
| "fp": 1, | |
| "fn": 0, | |
| "precision": 0.6666666666666666, | |
| "recall": 1.0, | |
| "f1": 0.8 | |
| }, | |
| "bear": { | |
| "support_gt": 3, | |
| "predicted": 1, | |
| "tp": 1, | |
| "fp": 0, | |
| "fn": 2, | |
| "precision": 1.0, | |
| "recall": 0.3333333333333333, | |
| "f1": 0.5 | |
| }, | |
| "looking_at_viewer": { | |
| "support_gt": 2, | |
| "predicted": 18, | |
| "tp": 2, | |
| "fp": 16, | |
| "fn": 0, | |
| "precision": 0.1111111111111111, | |
| "recall": 1.0, | |
| "f1": 0.19999999999999998 | |
| }, | |
| "solo": { | |
| "support_gt": 23, | |
| "predicted": 22, | |
| "tp": 20, | |
| "fp": 2, | |
| "fn": 3, | |
| "precision": 0.9090909090909091, | |
| "recall": 0.8695652173913043, | |
| "f1": 0.888888888888889 | |
| } | |
| }, | |
| "qwen/qwen2.5-coder-7b-instruct": { | |
| "anthro": { | |
| "support_gt": 16, | |
| "predicted": 30, | |
| "tp": 16, | |
| "fp": 14, | |
| "fn": 0, | |
| "precision": 0.5333333333333333, | |
| "recall": 1.0, | |
| "f1": 0.6956521739130436 | |
| }, | |
| "clothed": { | |
| "support_gt": 8, | |
| "predicted": 30, | |
| "tp": 8, | |
| "fp": 22, | |
| "fn": 0, | |
| "precision": 0.26666666666666666, | |
| "recall": 1.0, | |
| "f1": 0.4210526315789474 | |
| }, | |
| "clothing": { | |
| "support_gt": 12, | |
| "predicted": 30, | |
| "tp": 12, | |
| "fp": 18, | |
| "fn": 0, | |
| "precision": 0.4, | |
| "recall": 1.0, | |
| "f1": 0.5714285714285715 | |
| }, | |
| "canid": { | |
| "support_gt": 9, | |
| "predicted": 10, | |
| "tp": 6, | |
| "fp": 4, | |
| "fn": 3, | |
| "precision": 0.6, | |
| "recall": 0.6666666666666666, | |
| "f1": 0.631578947368421 | |
| }, | |
| "felid": { | |
| "support_gt": 2, | |
| "predicted": 1, | |
| "tp": 1, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.5, | |
| "f1": 0.6666666666666666 | |
| }, | |
| "bear": { | |
| "support_gt": 3, | |
| "predicted": 2, | |
| "tp": 2, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.6666666666666666, | |
| "f1": 0.8 | |
| }, | |
| "looking_at_viewer": { | |
| "support_gt": 2, | |
| "predicted": 9, | |
| "tp": 1, | |
| "fp": 8, | |
| "fn": 1, | |
| "precision": 0.1111111111111111, | |
| "recall": 0.5, | |
| "f1": 0.1818181818181818 | |
| }, | |
| "solo": { | |
| "support_gt": 23, | |
| "predicted": 26, | |
| "tp": 23, | |
| "fp": 3, | |
| "fn": 0, | |
| "precision": 0.8846153846153846, | |
| "recall": 1.0, | |
| "f1": 0.9387755102040816 | |
| } | |
| }, | |
| "mistralai/mistral-small-3.1-24b-instruct": { | |
| "anthro": { | |
| "support_gt": 16, | |
| "predicted": 27, | |
| "tp": 16, | |
| "fp": 11, | |
| "fn": 0, | |
| "precision": 0.5925925925925926, | |
| "recall": 1.0, | |
| "f1": 0.7441860465116279 | |
| }, | |
| "clothed": { | |
| "support_gt": 8, | |
| "predicted": 19, | |
| "tp": 8, | |
| "fp": 11, | |
| "fn": 0, | |
| "precision": 0.42105263157894735, | |
| "recall": 1.0, | |
| "f1": 0.5925925925925926 | |
| }, | |
| "clothing": { | |
| "support_gt": 12, | |
| "predicted": 19, | |
| "tp": 12, | |
| "fp": 7, | |
| "fn": 0, | |
| "precision": 0.631578947368421, | |
| "recall": 1.0, | |
| "f1": 0.7741935483870968 | |
| }, | |
| "canid": { | |
| "support_gt": 9, | |
| "predicted": 10, | |
| "tp": 9, | |
| "fp": 1, | |
| "fn": 0, | |
| "precision": 0.9, | |
| "recall": 1.0, | |
| "f1": 0.9473684210526316 | |
| }, | |
| "felid": { | |
| "support_gt": 2, | |
| "predicted": 1, | |
| "tp": 1, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.5, | |
| "f1": 0.6666666666666666 | |
| }, | |
| "bear": { | |
| "support_gt": 3, | |
| "predicted": 2, | |
| "tp": 2, | |
| "fp": 0, | |
| "fn": 1, | |
| "precision": 1.0, | |
| "recall": 0.6666666666666666, | |
| "f1": 0.8 | |
| }, | |
| "looking_at_viewer": { | |
| "support_gt": 2, | |
| "predicted": 3, | |
| "tp": 2, | |
| "fp": 1, | |
| "fn": 0, | |
| "precision": 0.6666666666666666, | |
| "recall": 1.0, | |
| "f1": 0.8 | |
| }, | |
| "solo": { | |
| "support_gt": 23, | |
| "predicted": 25, | |
| "tp": 23, | |
| "fp": 2, | |
| "fn": 0, | |
| "precision": 0.92, | |
| "recall": 1.0, | |
| "f1": 0.9583333333333334 | |
| } | |
| } | |
| } | |
| } |