Claude committed on
Commit
998779f
·
1 Parent(s): 684cf99

Add diagnostic script for structural clothing inference

Browse files

Creates diagnose_structural_clothing.py to isolate clothing-state tag
inference failures in the structural inference system.

Tests 7 hand-crafted captions with explicit clothing mentions:
- Clothed (formal wear, casual wear, complex descriptions)
- Nude, topless, bottomless variations
- Multiple characters

Logs full LLM responses to identify if failure is due to:
- Prompt design issues
- Model capability limits (Llama 3.1 8B)
- Response parsing bugs

Run locally with: python scripts/diagnose_structural_clothing.py

Context: n=50 eval showed 14/50 samples (28%) missed 'clothed' tag
despite explicit clothing mentions in captions. New group-based
structural inference had zero performance improvement over baseline.

https://claude.ai/code/session_015ZwE7a5E6YVTrMpuB2pXX7

scripts/diagnose_structural_clothing.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Diagnostic script to test structural inference for clothing state tags.
4
+
5
+ Tests with hand-crafted captions that explicitly mention clothing to identify
6
+ why the LLM is systematically failing to infer clothing state tags.
7
+ """
8
+
9
+ import sys
10
+ import os
11
+ from pathlib import Path
12
+
13
+ # Add project root to path
14
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
15
+
16
+ from psq_rag.llm.select import llm_infer_structural_tags, _get_structural_groups, _build_structural_prompt
17
+
18
+
19
+ # Test cases with explicit clothing mentions
20
# Hand-written captions, each naming clothing (or its absence) outright.
# Every entry maps "name" (label shown in the report), "caption" (text sent to
# structural inference) and "expected" (tags the caption should produce).
TEST_CASES = [
    dict(
        name="Explicit clothed - formal wear",
        caption="A male wolf wearing a black suit, white shirt, and red tie standing in an office.",
        expected=["solo", "anthro", "male", "clothed"],
    ),
    dict(
        name="Explicit clothed - casual wear",
        caption="An anthropomorphic fox in blue jeans and a t-shirt walking down a street.",
        expected=["solo", "anthro", "clothed"],
    ),
    dict(
        name="Explicit nude",
        caption="A naked female cat sitting on a beach, no clothing visible.",
        expected=["solo", "anthro", "female", "nude"],
    ),
    dict(
        name="Explicit topless",
        caption="A shirtless male dragon wearing pants, showing his muscular chest.",
        expected=["solo", "anthro", "male", "topless"],
    ),
    dict(
        name="Explicit bottomless",
        caption="A female rabbit wearing only a hoodie on her upper body, with her lower half uncovered.",
        expected=["solo", "anthro", "female", "bottomless"],
    ),
    dict(
        name="Multiple characters with clothing",
        caption="Two male dogs wearing police uniforms standing side by side.",
        expected=["duo", "anthro", "male", "clothed"],
    ),
    dict(
        name="Clothing mentioned in middle of description",
        caption="A muscular male wolf with red fur stands in a forest. He wears a black leather jacket and torn jeans. His eyes glow blue in the darkness.",
        expected=["solo", "anthro", "male", "clothed"],
    ),
]
57
+
58
+
59
def print_structural_prompt():
    """Print the structural-inference statements and the tag index mapping.

    Fetches the groups from ``_get_structural_groups()``, passes them to
    ``_build_structural_prompt()``, and writes two banner-delimited sections
    to stdout: the returned statement text, followed by one numbered line
    (1-based) per entry of ``flat_tags`` showing the tag and a preview of its
    definition limited to 60 characters.
    """
    banner = "=" * 80
    preview_len = 60

    groups = _get_structural_groups()
    statement_lines, flat_tags = _build_structural_prompt(groups)

    print(banner)
    print("STRUCTURAL INFERENCE STATEMENTS")
    print(banner)
    print(statement_lines)
    print("\n" + banner)
    print("TAG MAPPING (1-based index)")
    print(banner)
    # NOTE(review): assumes each flat_tags entry unpacks to a
    # (tag, definition) pair of strings - confirm against psq_rag.llm.select.
    for index, (tag, defn) in enumerate(flat_tags, 1):
        # Append the ellipsis only when the definition was actually cut, so a
        # short definition is not mistaken for a truncated one.
        preview = defn if len(defn) <= preview_len else defn[:preview_len] + "..."
        print(f"{index:2d}. {tag:20s} | {preview}")
    print(banner + "\n")
74
+
75
+
76
+ def run_diagnostic():
77
+ """Run diagnostic tests on structural clothing inference."""
78
+
79
+ print_structural_prompt()
80
+
81
+ print("\n" + "=" * 80)
82
+ print("RUNNING DIAGNOSTIC TESTS")
83
+ print("=" * 80 + "\n")
84
+
85
+ results = []
86
+
87
+ for i, test_case in enumerate(TEST_CASES, 1):
88
+ name = test_case["name"]
89
+ caption = test_case["caption"]
90
+ expected = test_case["expected"]
91
+
92
+ print(f"\n{'─' * 80}")
93
+ print(f"TEST {i}/{len(TEST_CASES)}: {name}")
94
+ print(f"{'─' * 80}")
95
+ print(f"Caption: {caption}")
96
+ print(f"Expected tags: {expected}")
97
+ print(f"\nCalling LLM...", flush=True)
98
+
99
+ # Call structural inference
100
+ def log_fn(msg):
101
+ print(f" [LOG] {msg}")
102
+
103
+ selected = llm_infer_structural_tags(
104
+ caption,
105
+ log=log_fn,
106
+ temperature=0.0,
107
+ max_tokens=512,
108
+ )
109
+
110
+ print(f"\nSelected tags: {selected}")
111
+
112
+ # Analyze results
113
+ expected_set = set(expected)
114
+ selected_set = set(selected)
115
+
116
+ clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
117
+ expected_clothing = expected_set & clothing_tags
118
+ selected_clothing = selected_set & clothing_tags
119
+
120
+ missed = expected_set - selected_set
121
+ extra = selected_set - expected_set
122
+ correct = expected_set & selected_set
123
+
124
+ clothing_correct = expected_clothing == selected_clothing
125
+
126
+ print(f"\n✓ Correct: {sorted(correct)}")
127
+ if missed:
128
+ print(f"✗ Missed: {sorted(missed)}")
129
+ if extra:
130
+ print(f"⚠ Extra: {sorted(extra)}")
131
+
132
+ print(f"\nClothing state inference: {'✓ PASS' if clothing_correct else '✗ FAIL'}")
133
+ if expected_clothing:
134
+ print(f" Expected: {sorted(expected_clothing)}")
135
+ print(f" Selected: {sorted(selected_clothing) if selected_clothing else '(none)'}")
136
+
137
+ results.append({
138
+ "name": name,
139
+ "caption": caption,
140
+ "expected": expected,
141
+ "selected": selected,
142
+ "clothing_correct": clothing_correct,
143
+ "missed": list(missed),
144
+ "extra": list(extra),
145
+ })
146
+
147
+ # Summary
148
+ print("\n\n" + "=" * 80)
149
+ print("SUMMARY")
150
+ print("=" * 80)
151
+
152
+ total_tests = len(results)
153
+ clothing_pass = sum(1 for r in results if r["clothing_correct"])
154
+ clothing_fail = total_tests - clothing_pass
155
+
156
+ print(f"\nTotal tests: {total_tests}")
157
+ print(f"Clothing state inference:")
158
+ print(f" ✓ Pass: {clothing_pass}/{total_tests} ({100*clothing_pass/total_tests:.0f}%)")
159
+ print(f" ✗ Fail: {clothing_fail}/{total_tests} ({100*clothing_fail/total_tests:.0f}%)")
160
+
161
+ if clothing_fail > 0:
162
+ print(f"\n{'─' * 80}")
163
+ print("FAILURES:")
164
+ print(f"{'─' * 80}")
165
+ for r in results:
166
+ if not r["clothing_correct"]:
167
+ print(f"\n• {r['name']}")
168
+ print(f" Caption: {r['caption'][:60]}...")
169
+ clothing_tags = {'clothed', 'nude', 'topless', 'bottomless'}
170
+ exp_clothing = set(r['expected']) & clothing_tags
171
+ sel_clothing = set(r['selected']) & clothing_tags
172
+ print(f" Expected: {sorted(exp_clothing)}")
173
+ print(f" Selected: {sorted(sel_clothing) if sel_clothing else '(none)'}")
174
+
175
+ # Overall assessment
176
+ print(f"\n{'=' * 80}")
177
+ print("DIAGNOSIS")
178
+ print(f"{'=' * 80}")
179
+
180
+ if clothing_pass == total_tests:
181
+ print("\n✓ All tests passed! Clothing inference is working correctly.")
182
+ elif clothing_pass == 0:
183
+ print("\n✗ ALL tests failed! The LLM is completely ignoring the clothing state group.")
184
+ print("\nPossible causes:")
185
+ print("1. Prompt design issue - clothing group not salient enough")
186
+ print("2. Model capability issue - Llama 3.1 8B cannot handle this task")
187
+ print("3. Response parsing issue - LLM is selecting but parser is missing it")
188
+ else:
189
+ print(f"\n⚠ Partial failure! {clothing_fail}/{total_tests} tests failed.")
190
+ print("\nThe LLM is sometimes inferring clothing state but inconsistently.")
191
+
192
+ return results
193
+
194
+
195
# Script entry point: the diagnostic runs only when this file is executed
# directly, not when it is imported. The per-case records returned by
# run_diagnostic() are kept in the module-level name ``results``.
if __name__ == "__main__":
    results = run_diagnostic()