Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude commited on Feb 13

Commit

998779f

1 Parent(s): 684cf99

Add diagnostic script for structural clothing inference

Creates diagnose_structural_clothing.py to isolate clothing state tag
inference failures in structural inference system.

Tests 7 hand-crafted captions with explicit clothing mentions:
- Clothed (formal wear, casual wear, complex descriptions)
- Nude, topless, bottomless variations
- Multiple characters

Logs full LLM responses to identify if failure is due to:
- Prompt design issues
- Model capability limits (Llama 3.1 8B)
- Response parsing bugs

Run locally with: python scripts/diagnose_structural_clothing.py

Context: n=50 eval showed 14/50 samples (28%) missed 'clothed' tag
despite explicit clothing mentions in captions. New group-based
structural inference had zero performance improvement over baseline.

https://claude.ai/code/session_015ZwE7a5E6YVTrMpuB2pXX7

Files changed (1) hide show

scripts/diagnose_structural_clothing.py +196 -0

scripts/diagnose_structural_clothing.py ADDED Viewed

	@@ -0,0 +1,196 @@

+#!/usr/bin/env python3
+"""
+Diagnostic script to test structural inference for clothing state tags.
+Tests with hand-crafted captions that explicitly mention clothing to identify
+why the LLM is systematically failing to infer clothing state tags.
+"""
+import sys
+import os
+from pathlib import Path
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+from psq_rag.llm.select import llm_infer_structural_tags, _get_structural_groups, _build_structural_prompt
+# Test cases with explicit clothing mentions
+TEST_CASES = [
+    {
+        "name": "Explicit clothed - formal wear",
+        "caption": "A male wolf wearing a black suit, white shirt, and red tie standing in an office.",
+        "expected": ["solo", "anthro", "male", "clothed"],
+    },
+    {
+        "name": "Explicit clothed - casual wear",
+        "caption": "An anthropomorphic fox in blue jeans and a t-shirt walking down a street.",
+        "expected": ["solo", "anthro", "clothed"],
+    },
+    {
+        "name": "Explicit nude",
+        "caption": "A naked female cat sitting on a beach, no clothing visible.",
+        "expected": ["solo", "anthro", "female", "nude"],
+    },
+    {
+        "name": "Explicit topless",
+        "caption": "A shirtless male dragon wearing pants, showing his muscular chest.",
+        "expected": ["solo", "anthro", "male", "topless"],
+    },
+    {
+        "name": "Explicit bottomless",
+        "caption": "A female rabbit wearing only a hoodie on her upper body, with her lower half uncovered.",
+        "expected": ["solo", "anthro", "female", "bottomless"],
+    },
+    {
+        "name": "Multiple characters with clothing",
+        "caption": "Two male dogs wearing police uniforms standing side by side.",
+        "expected": ["duo", "anthro", "male", "clothed"],
+    },
+    {
+        "name": "Clothing mentioned in middle of description",
+        "caption": "A muscular male wolf with red fur stands in a forest. He wears a black leather jacket and torn jeans. His eyes glow blue in the darkness.",
+        "expected": ["solo", "anthro", "male", "clothed"],
+    },
+]
+def print_structural_prompt():
+    """Print the actual statements the LLM sees."""
+    groups = _get_structural_groups()
+    statement_lines, flat_tags = _build_structural_prompt(groups)
+    print("=" * 80)
+    print("STRUCTURAL INFERENCE STATEMENTS")
+    print("=" * 80)
+    print(statement_lines)
+    print("\n" + "=" * 80)
+    print("TAG MAPPING (1-based index)")
+    print("=" * 80)
+    for i, (tag, defn) in enumerate(flat_tags, 1):
+        print(f"{i:2d}. {tag:20s} | {defn[:60]}...")
+    print("=" * 80 + "\n")
+def run_diagnostic():
+    """Run diagnostic tests on structural clothing inference."""
+    print_structural_prompt()
+    print("\n" + "=" * 80)
+    print("RUNNING DIAGNOSTIC TESTS")
+    print("=" * 80 + "\n")
+    results = []
+    for i, test_case in enumerate(TEST_CASES, 1):
+        name = test_case["name"]
+        caption = test_case["caption"]
+        expected = test_case["expected"]
+        print(f"\n{'─' * 80}")
+        print(f"TEST {i}/{len(TEST_CASES)}: {name}")
+        print(f"{'─' * 80}")
+        print(f"Caption: {caption}")
+        print(f"Expected tags: {expected}")
+        print(f"\nCalling LLM...", flush=True)
+        # Call structural inference
+        def log_fn(msg):
+            print(f"  [LOG] {msg}")
+        selected = llm_infer_structural_tags(
+            caption,
+            log=log_fn,
+            temperature=0.0,
+            max_tokens=512,
+        )
+        print(f"\nSelected tags: {selected}")
+        # Analyze results
+        expected_set = set(expected)
+        selected_set = set(selected)
+        clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
+        expected_clothing = expected_set & clothing_tags
+        selected_clothing = selected_set & clothing_tags
+        missed = expected_set - selected_set
+        extra = selected_set - expected_set
+        correct = expected_set & selected_set
+        clothing_correct = expected_clothing == selected_clothing
+        print(f"\n✓ Correct:  {sorted(correct)}")
+        if missed:
+            print(f"✗ Missed:   {sorted(missed)}")
+        if extra:
+            print(f"⚠ Extra:    {sorted(extra)}")
+        print(f"\nClothing state inference: {'✓ PASS' if clothing_correct else '✗ FAIL'}")
+        if expected_clothing:
+            print(f"  Expected: {sorted(expected_clothing)}")
+            print(f"  Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
+        results.append({
+            "name": name,
+            "caption": caption,
+            "expected": expected,
+            "selected": selected,
+            "clothing_correct": clothing_correct,
+            "missed": list(missed),
+            "extra": list(extra),
+        })
+    # Summary
+    print("\n\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+    total_tests = len(results)
+    clothing_pass = sum(1 for r in results if r["clothing_correct"])
+    clothing_fail = total_tests - clothing_pass
+    print(f"\nTotal tests: {total_tests}")
+    print(f"Clothing state inference:")
+    print(f"  ✓ Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
+    print(f"  ✗ Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
+    if clothing_fail > 0:
+        print(f"\n{'─' * 80}")
+        print("FAILURES:")
+        print(f"{'─' * 80}")
+        for r in results:
+            if not r["clothing_correct"]:
+                print(f"\n• {r['name']}")
+                print(f"  Caption: {r['caption'][:60]}...")
+                clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
+                exp_clothing = set(r['expected']) & clothing_tags
+                sel_clothing = set(r['selected']) & clothing_tags
+                print(f"  Expected: {sorted(exp_clothing)}")
+                print(f"  Selected: {sorted(sel_clothing) if sel_clothing else '(none)'}")
+    # Overall assessment
+    print(f"\n{'=' * 80}")
+    print("DIAGNOSIS")
+    print(f"{'=' * 80}")
+    if clothing_pass == total_tests:
+        print("\n✓ All tests passed! Clothing inference is working correctly.")
+    elif clothing_pass == 0:
+        print("\n✗ ALL tests failed! The LLM is completely ignoring the clothing state group.")
+        print("\nPossible causes:")
+        print("1. Prompt design issue - clothing group not salient enough")
+        print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
+        print("3. Response parsing issue - LLM is selecting but parser is missing it")
+    else:
+        print(f"\n⚠ Partial failure! {clothing_fail}/{total_tests} tests failed.")
+        print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
+    return results
+if __name__ == "__main__":
+    results = run_diagnostic()