{ "rows": [ { "model": "meta-llama/llama-3.1-8b-instruct", "price_prompt_per_token": 2e-08, "price_completion_per_token": 5e-08, "price_total_per_token": 6.999999999999999e-08, "price_ratio_vs_baseline": 1.0, "json_valid_rate_selection": 1.0, "json_parse_fail_rate_selection": 0.0, "selection_attempt_error_rate": 0.0, "selection_call_exhaust_rate": 0.06451612903225806, "macro_precision": 0.48757333333333336, "macro_recall": 0.68523, "macro_f1": 0.5531033333333333, "micro_precision": 0.4859038142620232, "micro_recall": 0.6845794392523364, "micro_f1": 0.5683802133850631, "hallucinated_avg_extra_tags": 10.333333333333334, "hallucinated_rate_fp_over_selected": 0.5140961857379768, "latency_avg_total_s": 16.39666666666667, "latency_p95_total_s": 28.11, "weird_spurious_character_samples": 1, "weird_rare_extra_ratio": 0.22580645161290322, "top_extra_tags": [ [ "clothed", 18 ], [ "clothing", 15 ], [ "anthro", 12 ], [ "looking_at_viewer", 11 ], [ "eyes", 8 ], [ "male", 5 ], [ "text", 5 ], [ "bottomwear", 3 ], [ "holding_object", 3 ], [ "nose", 3 ], [ "pose", 3 ], [ "topless", 3 ] ], "n_errors": 0 }, { "model": "mistralai/mistral-nemo", "price_prompt_per_token": 2e-08, "price_completion_per_token": 4e-08, "price_total_per_token": 6.000000000000001e-08, "price_ratio_vs_baseline": 0.8571428571428573, "json_valid_rate_selection": 1.0, "json_parse_fail_rate_selection": 0.0, "selection_attempt_error_rate": 0.0, "selection_call_exhaust_rate": 0.22580645161290322, "macro_precision": 0.5021633333333333, "macro_recall": 0.4754366666666667, "macro_f1": 0.4667133333333333, "micro_precision": 0.5153846153846153, "micro_recall": 0.4696261682242991, "micro_f1": 0.49144254278728605, "hallucinated_avg_extra_tags": 6.3, "hallucinated_rate_fp_over_selected": 0.4846153846153846, "latency_avg_total_s": 16.548666666666666, "latency_p95_total_s": 26.270000000000003, "weird_spurious_character_samples": 3, "weird_rare_extra_ratio": 0.13227513227513227, "top_extra_tags": [ [ "clothed", 21 ], [ "clothing", 17 ], [ "anthro", 14 ], [ "looking_at_viewer", 11 ], [ "female", 8 ], [ "male", 6 ], [ "pose", 5 ], [ "text", 5 ], [ "playful", 3 ], [ "ambiguous_gender", 2 ], [ "eyes", 2 ], [ "humanoid", 2 ] ], "n_errors": 0 }, { "model": "qwen/qwen-2.5-7b-instruct", "price_prompt_per_token": 4e-08, "price_completion_per_token": 1e-07, "price_total_per_token": 1.3999999999999998e-07, "price_ratio_vs_baseline": 2.0, "json_valid_rate_selection": 1.0, "json_parse_fail_rate_selection": 0.0, "selection_attempt_error_rate": 0.09090909090909091, "selection_call_exhaust_rate": 0.03225806451612903, "macro_precision": 0.5090266666666666, "macro_recall": 0.5072733333333334, "macro_f1": 0.4856666666666667, "micro_precision": 0.5071090047393365, "micro_recall": 0.5, "micro_f1": 0.5035294117647059, "hallucinated_avg_extra_tags": 6.933333333333334, "hallucinated_rate_fp_over_selected": 0.4928909952606635, "latency_avg_total_s": 18.443333333333335, "latency_p95_total_s": 33.75, "weird_spurious_character_samples": 1, "weird_rare_extra_ratio": 0.22596153846153846, "top_extra_tags": [ [ "looking_at_viewer", 16 ], [ "clothed", 12 ], [ "anthro", 9 ], [ "clothing", 9 ], [ "text", 6 ], [ "eyes", 3 ], [ "feral", 3 ], [ "playful", 3 ], [ "pose", 3 ], [ "black_clothing", 2 ], [ "dark_background", 2 ], [ "excited", 2 ] ], "n_errors": 0 }, { "model": "qwen/qwen2.5-coder-7b-instruct", "price_prompt_per_token": 3e-08, "price_completion_per_token": 9e-08, "price_total_per_token": 1.2e-07, "price_ratio_vs_baseline": 1.7142857142857144, "json_valid_rate_selection": 1.0, "json_parse_fail_rate_selection": 0.0, "selection_attempt_error_rate": 0.0, "selection_call_exhaust_rate": 0.0, "macro_precision": 0.3461066666666667, "macro_recall": 0.5469366666666666, "macro_f1": 0.40475666666666665, "micro_precision": 0.3219178082191781, "micro_recall": 0.5490654205607477, "micro_f1": 0.4058721934369603, "hallucinated_avg_extra_tags": 16.5, "hallucinated_rate_fp_over_selected": 0.678082191780822, "latency_avg_total_s": 14.717333333333332, "latency_p95_total_s": 24.189999999999998, "weird_spurious_character_samples": 10, "weird_rare_extra_ratio": 0.2080808080808081, "top_extra_tags": [ [ "male", 23 ], [ "clothed", 22 ], [ "clothing", 18 ], [ "anthro", 14 ], [ "eyes", 8 ], [ "looking_at_viewer", 8 ], [ "duo", 7 ], [ "pose", 6 ], [ "canis", 5 ], [ "holding_object", 5 ], [ "wolf", 5 ], [ "background_character", 4 ] ], "n_errors": 0 }, { "model": "mistralai/mistral-small-3.1-24b-instruct", "price_prompt_per_token": 3e-08, "price_completion_per_token": 1.1e-07, "price_total_per_token": 1.4e-07, "price_ratio_vs_baseline": 2.0000000000000004, "json_valid_rate_selection": 1.0, "json_parse_fail_rate_selection": 0.0, "selection_attempt_error_rate": 0.0, "selection_call_exhaust_rate": 0.0, "macro_precision": 0.5346866666666666, "macro_recall": 0.80151, "macro_f1": 0.62949, "micro_precision": 0.5291411042944786, "micro_recall": 0.8060747663551402, "micro_f1": 0.638888888888889, "hallucinated_avg_extra_tags": 10.233333333333333, "hallucinated_rate_fp_over_selected": 0.4708588957055215, "latency_avg_total_s": 21.778666666666666, "latency_p95_total_s": 31.800000000000004, "weird_spurious_character_samples": 0, "weird_rare_extra_ratio": 0.2280130293159609, "top_extra_tags": [ [ "anthro", 11 ], [ "clothed", 11 ], [ "eyes", 10 ], [ "pose", 8 ], [ "ambiguous_gender", 7 ], [ "clothing", 7 ], [ "text", 5 ], [ "holding_object", 4 ], [ "humanoid", 4 ], [ "playful", 4 ], [ "action_pose", 3 ], [ "feral", 3 ] ], "n_errors": 0 } ], "important_tags": [ "anthro", "clothed", "clothing", "canid", "felid", "bear", "looking_at_viewer", "solo" ], "per_tag": { "meta-llama/llama-3.1-8b-instruct": { "anthro": { "support_gt": 16, "predicted": 28, "tp": 16, "fp": 12, "fn": 0, "precision": 0.5714285714285714, "recall": 1.0, "f1": 0.7272727272727273 }, "clothed": { "support_gt": 8, "predicted": 26, "tp": 8, "fp": 18, "fn": 0, "precision": 0.3076923076923077, "recall": 1.0, "f1": 0.47058823529411764 }, "clothing": { "support_gt": 12, "predicted": 27, "tp": 12, "fp": 15, "fn": 0, "precision": 0.4444444444444444, "recall": 1.0, "f1": 0.6153846153846153 }, "canid": { "support_gt": 9, "predicted": 9, "tp": 9, "fp": 0, "fn": 0, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "felid": { "support_gt": 2, "predicted": 2, "tp": 2, "fp": 0, "fn": 0, "precision": 1.0, "recall": 1.0, "f1": 1.0 }, "bear": { "support_gt": 3, "predicted": 1, "tp": 1, "fp": 0, "fn": 2, "precision": 1.0, "recall": 0.3333333333333333, "f1": 0.5 }, "looking_at_viewer": { "support_gt": 2, "predicted": 13, "tp": 2, "fp": 11, "fn": 0, "precision": 0.15384615384615385, "recall": 1.0, "f1": 0.2666666666666667 }, "solo": { "support_gt": 23, "predicted": 25, "tp": 23, "fp": 2, "fn": 0, "precision": 0.92, "recall": 1.0, "f1": 0.9583333333333334 } }, "mistralai/mistral-nemo": { "anthro": { "support_gt": 16, "predicted": 30, "tp": 16, "fp": 14, "fn": 0, "precision": 0.5333333333333333, "recall": 1.0, "f1": 0.6956521739130436 }, "clothed": { "support_gt": 8, "predicted": 29, "tp": 8, "fp": 21, "fn": 0, "precision": 0.27586206896551724, "recall": 1.0, "f1": 0.4324324324324324 }, "clothing": { "support_gt": 12, "predicted": 29, "tp": 12, "fp": 17, "fn": 0, "precision": 0.41379310344827586, "recall": 1.0, "f1": 0.5853658536585366 }, "canid": { "support_gt": 9, "predicted": 5, "tp": 4, "fp": 1, "fn": 5, "precision": 0.8, "recall": 0.4444444444444444, "f1": 0.5714285714285714 }, "felid": { "support_gt": 2, "predicted": 1, "tp": 1, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.5, "f1": 0.6666666666666666 }, "bear": { "support_gt": 3, "predicted": 2, "tp": 2, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.6666666666666666, "f1": 0.8 }, "looking_at_viewer": { "support_gt": 2, "predicted": 13, "tp": 2, "fp": 11, "fn": 0, "precision": 0.15384615384615385, "recall": 1.0, "f1": 0.2666666666666667 }, "solo": { "support_gt": 23, "predicted": 25, "tp": 23, "fp": 2, "fn": 0, "precision": 0.92, "recall": 1.0, "f1": 0.9583333333333334 } }, "qwen/qwen-2.5-7b-instruct": { "anthro": { "support_gt": 16, "predicted": 25, "tp": 16, "fp": 9, "fn": 0, "precision": 0.64, "recall": 1.0, "f1": 0.7804878048780487 }, "clothed": { "support_gt": 8, "predicted": 19, "tp": 7, "fp": 12, "fn": 1, "precision": 0.3684210526315789, "recall": 0.875, "f1": 0.5185185185185185 }, "clothing": { "support_gt": 12, "predicted": 21, "tp": 12, "fp": 9, "fn": 0, "precision": 0.5714285714285714, "recall": 1.0, "f1": 0.7272727272727273 }, "canid": { "support_gt": 9, "predicted": 6, "tp": 5, "fp": 1, "fn": 4, "precision": 0.8333333333333334, "recall": 0.5555555555555556, "f1": 0.6666666666666667 }, "felid": { "support_gt": 2, "predicted": 3, "tp": 2, "fp": 1, "fn": 0, "precision": 0.6666666666666666, "recall": 1.0, "f1": 0.8 }, "bear": { "support_gt": 3, "predicted": 1, "tp": 1, "fp": 0, "fn": 2, "precision": 1.0, "recall": 0.3333333333333333, "f1": 0.5 }, "looking_at_viewer": { "support_gt": 2, "predicted": 18, "tp": 2, "fp": 16, "fn": 0, "precision": 0.1111111111111111, "recall": 1.0, "f1": 0.19999999999999998 }, "solo": { "support_gt": 23, "predicted": 22, "tp": 20, "fp": 2, "fn": 3, "precision": 0.9090909090909091, "recall": 0.8695652173913043, "f1": 0.888888888888889 } }, "qwen/qwen2.5-coder-7b-instruct": { "anthro": { "support_gt": 16, "predicted": 30, "tp": 16, "fp": 14, "fn": 0, "precision": 0.5333333333333333, "recall": 1.0, "f1": 0.6956521739130436 }, "clothed": { "support_gt": 8, "predicted": 30, "tp": 8, "fp": 22, "fn": 0, "precision": 0.26666666666666666, "recall": 1.0, "f1": 0.4210526315789474 }, "clothing": { "support_gt": 12, "predicted": 30, "tp": 12, "fp": 18, "fn": 0, "precision": 0.4, "recall": 1.0, "f1": 0.5714285714285715 }, "canid": { "support_gt": 9, "predicted": 10, "tp": 6, "fp": 4, "fn": 3, "precision": 0.6, "recall": 0.6666666666666666, "f1": 0.631578947368421 }, "felid": { "support_gt": 2, "predicted": 1, "tp": 1, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.5, "f1": 0.6666666666666666 }, "bear": { "support_gt": 3, "predicted": 2, "tp": 2, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.6666666666666666, "f1": 0.8 }, "looking_at_viewer": { "support_gt": 2, "predicted": 9, "tp": 1, "fp": 8, "fn": 1, "precision": 0.1111111111111111, "recall": 0.5, "f1": 0.1818181818181818 }, "solo": { "support_gt": 23, "predicted": 26, "tp": 23, "fp": 3, "fn": 0, "precision": 0.8846153846153846, "recall": 1.0, "f1": 0.9387755102040816 } }, "mistralai/mistral-small-3.1-24b-instruct": { "anthro": { "support_gt": 16, "predicted": 27, "tp": 16, "fp": 11, "fn": 0, "precision": 0.5925925925925926, "recall": 1.0, "f1": 0.7441860465116279 }, "clothed": { "support_gt": 8, "predicted": 19, "tp": 8, "fp": 11, "fn": 0, "precision": 0.42105263157894735, "recall": 1.0, "f1": 0.5925925925925926 }, "clothing": { "support_gt": 12, "predicted": 19, "tp": 12, "fp": 7, "fn": 0, "precision": 0.631578947368421, "recall": 1.0, "f1": 0.7741935483870968 }, "canid": { "support_gt": 9, "predicted": 10, "tp": 9, "fp": 1, "fn": 0, "precision": 0.9, "recall": 1.0, "f1": 0.9473684210526316 }, "felid": { "support_gt": 2, "predicted": 1, "tp": 1, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.5, "f1": 0.6666666666666666 }, "bear": { "support_gt": 3, "predicted": 2, "tp": 2, "fp": 0, "fn": 1, "precision": 1.0, "recall": 0.6666666666666666, "f1": 0.8 }, "looking_at_viewer": { "support_gt": 2, "predicted": 3, "tp": 2, "fp": 1, "fn": 0, "precision": 0.6666666666666666, "recall": 1.0, "f1": 0.8 }, "solo": { "support_gt": 23, "predicted": 25, "tp": 23, "fp": 2, "fn": 0, "precision": 0.92, "recall": 1.0, "f1": 0.9583333333333334 } } } }