Spaces:

primerz
/

pixagram-stable

Runtime error

App Files Files Community

primerz committed on Oct 31, 2025

Commit

b22253e

verified ·

1 Parent(s): e9201b0

Upload 2 files

Browse files

Files changed (2) hide show

generator.py +186 -51
models.py +173 -50

generator.py CHANGED Viewed

@@ -48,7 +48,7 @@ class RetroArtConverter:
         self.mediapipe_face, mediapipe_success = load_mediapipe_face_detector()
         self.models_loaded['mediapipe_face'] = mediapipe_success
-        # Load Depth detector with fallback hierarchy (Leres -> Midas)
         self.depth_detector, self.depth_type, depth_success = load_depth_detector()
         self.models_loaded['depth_detector'] = depth_success
         self.models_loaded['depth_type'] = self.depth_type
@@ -116,11 +116,29 @@ class RetroArtConverter:
         self.models_loaded['lora'] = lora_success
         # Setup IP-Adapter
-        if self.instantid_active and self.image_encoder is not None: # <-- Check instantid_active
             self.image_proj_model, ip_adapter_success = setup_ip_adapter(self.pipe, self.image_encoder)
             self.models_loaded['ip_adapter'] = ip_adapter_success
         else:
-            print("[INFO] Face preservation: IP-Adapter disabled (InstantID model failed or encoder failed)")
             self.models_loaded['ip_adapter'] = False
             self.image_proj_model = None
@@ -166,6 +184,25 @@ class RetroArtConverter:
             print(f"{model}: {status}")
         print("===================\n")
         print("=== UPGRADE VERIFICATION ===")
         try:
             from resampler_enhanced import EnhancedResampler
@@ -191,7 +228,7 @@ class RetroArtConverter:
     def get_depth_map(self, image):
             """
             Generate depth map using available depth detector.
-            Supports: LeresDetector or MidasDetector.
             """
             if self.depth_detector is not None:
                 try:
@@ -253,6 +290,11 @@ class RetroArtConverter:
         +1-2% improvement in face preservation.
         """
         try:
             multi_scale_embeds = []
             for scale in MULTI_SCALE_FACTORS:
@@ -268,8 +310,9 @@ class RetroArtConverter:
                 scaled_array = cv2.cvtColor(np.array(scaled_crop), cv2.COLOR_RGB2BGR)
                 scaled_faces = self.face_app.get(scaled_array)
-                if len(scaled_faces) > 0:
-                    multi_scale_embeds.append(scaled_faces[0].normed_embedding)
             # Average embeddings
             if len(multi_scale_embeds) > 0:
@@ -279,7 +322,13 @@ class RetroArtConverter:
                 print(f"[MULTI-SCALE] Combined {len(multi_scale_embeds)} scales")
                 return averaged
-            return face.normed_embedding
         except Exception as e:
             print(f"[MULTI-SCALE] Failed: {e}, using single scale")
@@ -539,7 +588,7 @@ class RetroArtConverter:
         # Generate depth map
         depth_image = None
         if self.depth_active:
-            print("Generating depth map...")
             depth_image = self.get_depth_map(resized_image)
             if depth_image.size != (target_width, target_height):
                 depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
@@ -594,32 +643,82 @@ class RetroArtConverter:
                             guidance_scale = adaptive_params['guidance_scale']
                             lora_scale = adaptive_params['lora_scale']
-                        # Extract face embeddings
-                        face_embeddings_base = face.normed_embedding
-                        # Extract face crop
-                        bbox = face.bbox.astype(int)
-                        x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
-                        face_bbox_original = [x1, y1, x2, y2]
-                        # Add padding
-                        face_width = x2 - x1
-                        face_height = y2 - y1
-                        padding_x = int(face_width * 0.3)
-                        padding_y = int(face_height * 0.3)
-                        x1 = max(0, x1 - padding_x)
-                        y1 = max(0, y1 - padding_y)
-                        x2 = min(resized_image.width, x2 + padding_x)
-                        y2 = min(resized_image.height, y2 + padding_y)
-                        # Crop face region
-                        face_crop = resized_image.crop((x1, y1, x2, y2))
-                        # MULTI-SCALE PROCESSING
-                        face_embeddings = self.extract_multi_scale_face(face_crop, face)
-                        # Enhance face crop
-                        face_crop_enhanced = enhance_face_crop(face_crop)
                         # Draw keypoints
                         face_kps = face.kps
@@ -691,6 +790,26 @@ class RetroArtConverter:
                     print("  - MediapipeFace: tried, found nothing")
                 else:
                     print("  - MediapipeFace: not available")
                 print()
         # Set LORA scale
@@ -761,31 +880,47 @@ class RetroArtConverter:
                 # Add face embeddings for IP-Adapter if available
                 if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
                     print(f"Processing InstantID face embeddings with Resampler...")
-                    with torch.no_grad():
-                        face_emb_tensor = torch.from_numpy(face_embeddings).to(device=self.device, dtype=self.dtype)
-                        face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
-                        face_proj_embeds = self.image_proj_model(face_emb_tensor)
-                        boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
-                        face_proj_embeds = face_proj_embeds * boosted_scale
-                        print(f"  - Face embedding: {face_emb_tensor.shape} -> {face_proj_embeds.shape}, Scale: {boosted_scale:.2f}")
-                        if 'prompt_embeds' in pipe_kwargs:
-                            original_embeds = pipe_kwargs['prompt_embeds']
-                            if original_embeds.shape[0] > 1: # Handle CFG
-                                face_proj_embeds = torch.cat([torch.zeros_like(face_proj_embeds), face_proj_embeds], dim=0)
-                            combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
-                            pipe_kwargs['prompt_embeds'] = combined_embeds
-                            print(f"  [OK] Face embeddings concatenated successfully! New shape: {combined_embeds.shape}")
-                        else:
-                            print(f"  [WARNING] Can't concatenate - no prompt_embeds (use Compel)")
                 elif has_detected_faces:
                     print("  Face detected but IP-Adapter/embeddings unavailable, using keypoints only")
             else:
                 # No face detected - blank map needed to maintain ControlNet list order

         self.mediapipe_face, mediapipe_success = load_mediapipe_face_detector()
         self.models_loaded['mediapipe_face'] = mediapipe_success
+        # Load Depth detector with fallback hierarchy (Leres → Zoe → Midas)
         self.depth_detector, self.depth_type, depth_success = load_depth_detector()
         self.models_loaded['depth_detector'] = depth_success
         self.models_loaded['depth_type'] = self.depth_type
         self.models_loaded['lora'] = lora_success
         # Setup IP-Adapter
+        if self.instantid_active and self.image_encoder is not None:
+            print("[IP-ADAPTER] Attempting IP-Adapter setup...")
+            print(f"  - InstantID active: {self.instantid_active}")
+            print(f"  - Image encoder available: {self.image_encoder is not None}")
+            print(f"  - Device: {device}, dtype: {dtype}")
             self.image_proj_model, ip_adapter_success = setup_ip_adapter(self.pipe, self.image_encoder)
             self.models_loaded['ip_adapter'] = ip_adapter_success
+            if ip_adapter_success:
+                print("[IP-ADAPTER] ✓ Successfully loaded!")
+            else:
+                print("[IP-ADAPTER] ✗ Setup failed - face embeddings will not be used")
+                print("[IP-ADAPTER] System will fallback to keypoints-only mode (reduced quality)")
         else:
+            reasons = []
+            if not self.instantid_active:
+                reasons.append("InstantID ControlNet not loaded")
+            if self.image_encoder is None:
+                reasons.append("Image encoder not loaded")
+            print(f"[INFO] Face preservation: IP-Adapter disabled ({', '.join(reasons)})")
+            print("[INFO] System will use keypoints-only mode (reduced quality)")
             self.models_loaded['ip_adapter'] = False
             self.image_proj_model = None
             print(f"{model}: {status}")
         print("===================\n")
+        # Additional IP-Adapter diagnostic
+        print("=== IP-ADAPTER DIAGNOSTIC ===")
+        print(f"InstantID ControlNet loaded: {self.models_loaded.get('instantid', False)}")
+        print(f"Image encoder available: {self.image_encoder is not None}")
+        print(f"Image projection model available: {self.image_proj_model is not None}")
+        print(f"IP-Adapter marked as loaded: {self.models_loaded.get('ip_adapter', False)}")
+        if self.models_loaded.get('ip_adapter', False):
+            print("✓ IP-Adapter FULLY FUNCTIONAL - face embeddings will be used")
+        else:
+            print("✗ IP-Adapter NOT AVAILABLE - will use keypoints only (reduced quality)")
+            if not self.models_loaded.get('instantid', False):
+                print("  Issue: InstantID ControlNet failed to load")
+            if self.image_encoder is None:
+                print("  Issue: Image encoder (CLIP) failed to load")
+            if self.image_proj_model is None:
+                print("  Issue: Image projection model (Resampler) failed to load")
+        print("=============================\n")
         print("=== UPGRADE VERIFICATION ===")
         try:
             from resampler_enhanced import EnhancedResampler
     def get_depth_map(self, image):
             """
             Generate depth map using available depth detector.
+            Supports: LeresDetector, ZoeDetector, or MidasDetector.
             """
             if self.depth_detector is not None:
                 try:
         +1-2% improvement in face preservation.
         """
         try:
+            # Check if face has valid embedding first
+            if not hasattr(face, 'normed_embedding') or face.normed_embedding is None:
+                print("[MULTI-SCALE] Face has no normed_embedding, cannot extract features")
+                return None
             multi_scale_embeds = []
             for scale in MULTI_SCALE_FACTORS:
                 scaled_array = cv2.cvtColor(np.array(scaled_crop), cv2.COLOR_RGB2BGR)
                 scaled_faces = self.face_app.get(scaled_array)
+                if len(scaled_faces) > 0 and hasattr(scaled_faces[0], 'normed_embedding'):
+                    if scaled_faces[0].normed_embedding is not None:
+                        multi_scale_embeds.append(scaled_faces[0].normed_embedding)
             # Average embeddings
             if len(multi_scale_embeds) > 0:
                 print(f"[MULTI-SCALE] Combined {len(multi_scale_embeds)} scales")
                 return averaged
+            # Return original if multi-scale failed but original exists
+            if hasattr(face, 'normed_embedding') and face.normed_embedding is not None:
+                print("[MULTI-SCALE] Multi-scale failed, using original embedding")
+                return face.normed_embedding
+            print("[MULTI-SCALE] No embeddings available at any scale")
+            return None
         except Exception as e:
             print(f"[MULTI-SCALE] Failed: {e}, using single scale")
         # Generate depth map
         depth_image = None
         if self.depth_active:
+            print("Generating Zoe depth map...")
             depth_image = self.get_depth_map(resized_image)
             if depth_image.size != (target_width, target_height):
                 depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
                             guidance_scale = adaptive_params['guidance_scale']
                             lora_scale = adaptive_params['lora_scale']
+                        # Extract face embeddings with validation
+                        try:
+                            if not hasattr(face, 'normed_embedding') or face.normed_embedding is None:
+                                print("  [ERROR] Face object has no normed_embedding attribute")
+                                face_embeddings_base = None
+                            else:
+                                face_embeddings_base = face.normed_embedding
+                                print(f"  [OK] Base embeddings extracted: shape {face_embeddings_base.shape}")
+                        except Exception as e:
+                            print(f"  [ERROR] Failed to extract base embeddings: {e}")
+                            face_embeddings_base = None
+                        # Extract face crop with validation
+                        try:
+                            bbox = face.bbox.astype(int)
+                            x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
+                            face_bbox_original = [x1, y1, x2, y2]
+                            # Validate bbox
+                            face_width = x2 - x1
+                            face_height = y2 - y1
+                            print(f"  [INFO] Face bbox: ({x1}, {y1}, {x2}, {y2}), size: {face_width}x{face_height}")
+                            if face_width <= 0 or face_height <= 0:
+                                print(f"  [ERROR] Invalid face dimensions: {face_width}x{face_height}")
+                                raise ValueError("Invalid face bbox")
+                            if face_width < 20 or face_height < 20:
+                                print(f"  [WARNING] Face very small: {face_width}x{face_height} (may affect quality)")
+                            # Add padding
+                            padding_x = int(face_width * 0.3)
+                            padding_y = int(face_height * 0.3)
+                            x1 = max(0, x1 - padding_x)
+                            y1 = max(0, y1 - padding_y)
+                            x2 = min(resized_image.width, x2 + padding_x)
+                            y2 = min(resized_image.height, y2 + padding_y)
+                            # Validate padded bbox
+                            if x2 <= x1 or y2 <= y1:
+                                print(f"  [ERROR] Invalid padded bbox: ({x1}, {y1}, {x2}, {y2})")
+                                raise ValueError("Invalid padded bbox")
+                            # Crop face region
+                            face_crop = resized_image.crop((x1, y1, x2, y2))
+                            print(f"  [OK] Face cropped: {face_crop.size}")
+                        except Exception as e:
+                            print(f"  [ERROR] Face cropping failed: {e}")
+                            face_crop = None
+                            face_bbox_original = None
+                        # MULTI-SCALE PROCESSING (only if we have valid crop and base embeddings)
+                        if face_crop is not None and face_embeddings_base is not None:
+                            try:
+                                face_embeddings = self.extract_multi_scale_face(face_crop, face)
+                                print(f"  [OK] Multi-scale embeddings extracted")
+                            except Exception as e:
+                                print(f"  [WARNING] Multi-scale extraction failed: {e}, using base embeddings")
+                                face_embeddings = face_embeddings_base
+                        else:
+                            print(f"  [ERROR] Cannot extract embeddings - crop or base embeddings unavailable")
+                            face_embeddings = None
+                        # Enhance face crop (only if crop succeeded)
+                        if face_crop is not None:
+                            try:
+                                face_crop_enhanced = enhance_face_crop(face_crop)
+                                print(f"  [OK] Face crop enhanced: {face_crop_enhanced.size}")
+                            except Exception as e:
+                                print(f"  [WARNING] Face enhancement failed: {e}, using original crop")
+                                face_crop_enhanced = face_crop
+                        else:
+                            print(f"  [ERROR] Cannot enhance - no face crop available")
+                            face_crop_enhanced = None
                         # Draw keypoints
                         face_kps = face.kps
                     print("  - MediapipeFace: tried, found nothing")
                 else:
                     print("  - MediapipeFace: not available")
+                print("\n[RECOMMENDATION] To improve face detection:")
+                print("  1. Ensure face is clearly visible and front-facing")
+                print("  2. Face should be at least 30% of the image area")
+                print("  3. Use good lighting and avoid extreme angles")
+                print("  4. Minimum recommended face size: 100x100 pixels")
+                print()
+            elif face_embeddings is None and has_detected_faces:
+                print("\n[SUMMARY] Face detected but embeddings extraction failed")
+                print("[REASON] This can happen when:")
+                print("  1. Face is detected but too small for embedding extraction (<50x50px)")
+                print("  2. Face angle is too extreme (profile view >45°)")
+                print("  3. Face is partially occluded or cut off at image edge")
+                print("  4. Detection confidence is low (<0.5)")
+                print("\n[RECOMMENDATION] To fix:")
+                print("  1. Use a larger, clearer image")
+                print("  2. Ensure face is centered and front-facing")
+                print("  3. Crop image to focus on the face")
+                print("  4. Avoid faces near image borders")
+                print("\n[IMPACT] Generation will use keypoints only (85-90% similarity vs 96-99% with embeddings)")
                 print()
         # Set LORA scale
                 # Add face embeddings for IP-Adapter if available
                 if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
                     print(f"Processing InstantID face embeddings with Resampler...")
+                    print(f"  [DEBUG] face_embeddings shape: {face_embeddings.shape if hasattr(face_embeddings, 'shape') else 'numpy array'}")
+                    print(f"  [DEBUG] image_proj_model available: {self.image_proj_model is not None}")
+                    print(f"  [DEBUG] IP-Adapter loaded: {self.models_loaded.get('ip_adapter', False)}")
+                    try:
+                        with torch.no_grad():
+                            face_emb_tensor = torch.from_numpy(face_embeddings).to(device=self.device, dtype=self.dtype)
+                            face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
+                            if self.image_proj_model is None:
+                                print("  [ERROR] image_proj_model is None! Cannot process embeddings.")
+                            else:
+                                face_proj_embeds = self.image_proj_model(face_emb_tensor)
+                                boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
+                                face_proj_embeds = face_proj_embeds * boosted_scale
+                                print(f"  - Face embedding: {face_emb_tensor.shape} -> {face_proj_embeds.shape}, Scale: {boosted_scale:.2f}")
+                                if 'prompt_embeds' in pipe_kwargs:
+                                    original_embeds = pipe_kwargs['prompt_embeds']
+                                    if original_embeds.shape[0] > 1: # Handle CFG
+                                        face_proj_embeds = torch.cat([torch.zeros_like(face_proj_embeds), face_proj_embeds], dim=0)
+                                    combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
+                                    pipe_kwargs['prompt_embeds'] = combined_embeds
+                                    print(f"  [OK] Face embeddings concatenated successfully! New shape: {combined_embeds.shape}")
+                                else:
+                                    print(f"  [WARNING] Can't concatenate - no prompt_embeds (use Compel)")
+                    except Exception as e:
+                        print(f"  [ERROR] Failed to process face embeddings: {e}")
+                        import traceback
+                        traceback.print_exc()
                 elif has_detected_faces:
                     print("  Face detected but IP-Adapter/embeddings unavailable, using keypoints only")
+                    print(f"    - face_embeddings available: {face_embeddings is not None}")
+                    print(f"    - IP-Adapter loaded: {self.models_loaded.get('ip_adapter', False)}")
+                    print(f"    - face_crop_enhanced available: {face_crop_enhanced is not None}")
+                    print(f"    - image_proj_model available: {self.image_proj_model is not None}")
             else:
                 # No face detected - blank map needed to maintain ControlNet list order

models.py CHANGED Viewed

@@ -13,7 +13,7 @@ from diffusers import (
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
-from controlnet_aux import OpenposeDetector, LeresDetector, MidasDetector, MediapipeFaceDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
@@ -62,7 +62,7 @@ def download_model_with_retry(repo_id, filename, max_retries=None):
 def load_face_analysis():
-    """Load face analysis model with proper error handling."""
     print("Loading face analysis model...")
     try:
         face_app = FaceAnalysis(
@@ -70,20 +70,39 @@ def load_face_analysis():
             root='./models/insightface',
             providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
         )
         face_app.prepare(
             ctx_id=FACE_DETECTION_CONFIG['ctx_id'],
             det_size=FACE_DETECTION_CONFIG['det_size']
         )
         print("  [OK] Face analysis model loaded successfully")
         return face_app, True
     except Exception as e:
         print(f"  [WARNING] Face detection not available: {e}")
         return None, False
 def load_depth_detector():
     """
-    Load depth detector with fallback hierarchy: Leres -> Midas.
     Returns (detector, detector_type, success).
     """
     print("Loading depth detector with fallback hierarchy...")
@@ -98,9 +117,19 @@ def load_depth_detector():
     except Exception as e:
         print(f"  [INFO] LeresDetector not available: {e}")
-    # Fallback to MidasDetector
     try:
-        print("  Attempting MidasDetector (fallback)...")
         midas_depth = MidasDetector.from_pretrained("lllyasviel/Annotators")
         midas_depth.to(device)
         print("  [OK] MidasDetector loaded successfully")
@@ -140,24 +169,40 @@ def load_mediapipe_face_detector():
 def load_controlnets():
     """Load ControlNet models."""
-    print("Loading ControlNet Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
-        "diffusers/controlnet-zoe-depth-sdxl-1.0",  # Model repo name (not tied to detector)
         torch_dtype=dtype
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
     # --- NEW: Load OpenPose ControlNet ---
     print("Loading ControlNet OpenPose model...")
-    try:
-        controlnet_openpose = ControlNetModel.from_pretrained(
-            "diffusers/controlnet-openpose-sdxl-1.0",
-            torch_dtype=dtype
-        ).to(device)
-        print("  [OK] ControlNet OpenPose loaded")
-    except Exception as e:
-        print(f"  [WARNING] ControlNet OpenPose not available: {e}")
-        controlnet_openpose = None
     # --- END NEW ---
     print("Loading InstantID ControlNet...")
@@ -237,18 +282,37 @@ def setup_ip_adapter(pipe, image_encoder):
     Based on the reference InstantID pipeline.
     """
     if image_encoder is None:
         return None, False
     print("Setting up IP-Adapter for InstantID face embeddings (proper implementation)...")
     try:
-        # Download InstantID weights
         ip_adapter_path = download_model_with_retry(
             "InstantX/InstantID",
             "ip-adapter.bin"
         )
-        # Load full state dict
         state_dict = torch.load(ip_adapter_path, map_location="cpu")
         # Extract image_proj and ip_adapter weights
         image_proj_state_dict = {}
@@ -260,38 +324,81 @@ def setup_ip_adapter(pipe, image_encoder):
             elif key.startswith("ip_adapter."):
                 ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value
-        # Create Resampler (image projection model) with CORRECT parameters from reference
-        print("Creating Resampler (Perceiver architecture)...")
         image_proj_model = Resampler(
-            dim=1280,                                       # Hidden dimension
-            depth=4,                                        # IMPORTANT: 4 layers (not 8!)
-            dim_head=64,                                    # Dimension per head
-            heads=20,                                       # Number of heads
-            num_queries=16,                                 # Number of output tokens
-            embedding_dim=512,                              # InsightFace embedding dim
-            output_dim=pipe.unet.config.cross_attention_dim,  # SDXL cross-attention dim (2048)
-            ff_mult=4                                       # Feedforward multiplier
         )
         image_proj_model.eval()
         image_proj_model = image_proj_model.to(device, dtype=dtype)
-        # Load image_proj weights
         if image_proj_state_dict:
             try:
-                image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
                 print("  [OK] Resampler loaded with pretrained weights")
             except Exception as e:
                 print(f"  [WARNING] Could not load Resampler weights: {e}")
-                print("  Using randomly initialized Resampler")
         else:
-            print("  [WARNING] No image_proj weights found, using random initialization")
-        # Setup IP-Adapter attention processors
-        print("Setting up IP-Adapter attention processors...")
         attn_procs = {}
-        num_tokens = 16  # Match Resampler num_queries
         for name in pipe.unet.attn_processors.keys():
             cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim
@@ -315,32 +422,48 @@ def setup_ip_adapter(pipe, image_encoder):
                     scale=1.0,
                     num_tokens=num_tokens
                 ).to(device, dtype=dtype)
         # Set attention processors
         pipe.unet.set_attn_processor(attn_procs)
-        # Load IP-Adapter weights into attention processors
         if ip_adapter_state_dict:
-            try:
-                ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
-                ip_layers.load_state_dict(ip_adapter_state_dict, strict=False)
-                print("  [OK] IP-Adapter attention weights loaded")
-            except Exception as e:
-                print(f"  [WARNING] Could not load IP-Adapter weights: {e}")
         else:
-            print("  [WARNING] No ip_adapter weights found")
-        # Store image encoder and projection model
         pipe.image_encoder = image_encoder
-        print("  [OK] IP-Adapter fully loaded with InstantID architecture")
-        print(f"  - Resampler: 4 layers, 20 heads, 16 output tokens")
-        print(f"  - Face embeddings: 512D -> 16x2048D")
         return image_proj_model, True
     except Exception as e:
-        print(f"  [ERROR] Could not setup IP-Adapter: {e}")
         import traceback
         traceback.print_exc()
         return None, False

 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
+from controlnet_aux import ZoeDetector, OpenposeDetector, LeresDetector, MidasDetector, MediapipeFaceDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
 def load_face_analysis():
     """Build and prepare the InsightFace ``FaceAnalysis`` app.

     After preparation, the task names of the loaded models are scanned for
     a recognition model and the outcome is printed. A missing recognition
     model only produces warnings; it does not fail the load.

     Returns:
         tuple: ``(face_app, True)`` on success, or ``(None, False)`` if any
         step raised an exception (the traceback is printed).
     """
     print("Loading face analysis model...")
     try:
         face_app = FaceAnalysis(
             root='./models/insightface',
             providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
         )
         print("  Preparing face analysis with recognition...")
         face_app.prepare(
             ctx_id=FACE_DETECTION_CONFIG['ctx_id'],
             det_size=FACE_DETECTION_CONFIG['det_size'],
         )
         # Pick the first task name that looks like a recognition model, if any.
         rec_task = next(
             (
                 name
                 for name in face_app.models.keys()
                 if 'recognition' in name or 'rec' in name
             ),
             None,
         )
         if rec_task is None:
             print("  [WARNING] No recognition model found in face_app")
             print(f"  [INFO] Available models: {list(face_app.models.keys())}")
             print("  [INFO] Face embeddings may not be available")
         else:
             print(f"  [OK] Recognition model found: {rec_task}")
         print("  [OK] Face analysis model loaded successfully")
         return face_app, True
     except Exception as e:
         # Best-effort loader: report the failure and let the caller degrade.
         print(f"  [WARNING] Face detection not available: {e}")
         import traceback
         traceback.print_exc()
         return None, False
 def load_depth_detector():
     """
+    Load depth detector with fallback hierarchy: Leres → Zoe → Midas.
     Returns (detector, detector_type, success).
     """
     print("Loading depth detector with fallback hierarchy...")
     except Exception as e:
         print(f"  [INFO] LeresDetector not available: {e}")
+    # Fallback to ZoeDetector
     try:
+        print("  Attempting ZoeDetector (fallback #1)...")
+        zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
+        zoe_depth.to(device)
+        print("  [OK] ZoeDetector loaded successfully")
+        return zoe_depth, 'zoe', True
+    except Exception as e:
+        print(f"  [INFO] ZoeDetector not available: {e}")
+    # Final fallback to MidasDetector
+    try:
+        print("  Attempting MidasDetector (fallback #2)...")
         midas_depth = MidasDetector.from_pretrained("lllyasviel/Annotators")
         midas_depth.to(device)
         print("  [OK] MidasDetector loaded successfully")
 def load_controlnets():
     """Load ControlNet models."""
+    print("Loading ControlNet Zoe Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
+        "diffusers/controlnet-zoe-depth-sdxl-1.0",
         torch_dtype=dtype
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
     # --- NEW: Load OpenPose ControlNet ---
     print("Loading ControlNet OpenPose model...")
+    controlnet_openpose = None  # Initialize as None
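+    # Try multiple candidate OpenPose-style ControlNet models, in order.
+    # NOTE(review): "control_v11p_sd15_openpose" is named as an SD 1.5 checkpoint — confirm it is actually usable with the SDXL pipeline.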
+    openpose_models = [
+        ("lllyasviel/control_v11p_sd15_openpose", "SDXL-compatible OpenPose from lllyasviel"),
+        ("CrucibleAI/ControlNetMediaPipeFace", "MediaPipe Face alternative"),
+    ]
+    for model_id, description in openpose_models:
+        try:
+            print(f"  Trying {description}: {model_id}")
+            controlnet_openpose = ControlNetModel.from_pretrained(
+                model_id,
+                torch_dtype=dtype
+            ).to(device)
+            print(f"  [OK] ControlNet OpenPose loaded from {model_id}")
+            break
+        except Exception as e:
+            print(f"  [INFO] {model_id} not compatible: {str(e)[:100]}")
+            continue
+    if controlnet_openpose is None:
+        print("  [WARNING] No OpenPose ControlNet available for SDXL")
+        print("  [INFO] Expression control will be disabled (not critical)")
+        print("  [INFO] System will work with Identity + Depth ControlNets only")
     # --- END NEW ---
     print("Loading InstantID ControlNet...")
     Based on the reference InstantID pipeline.
     """
     if image_encoder is None:
+        print("[ERROR] setup_ip_adapter: image_encoder is None")
+        return None, False
+    if pipe is None:
+        print("[ERROR] setup_ip_adapter: pipe is None")
         return None, False
     print("Setting up IP-Adapter for InstantID face embeddings (proper implementation)...")
+    # Step 1: Download weights
     try:
+        print("  [1/5] Downloading IP-Adapter weights...")
         ip_adapter_path = download_model_with_retry(
             "InstantX/InstantID",
             "ip-adapter.bin"
         )
+        if ip_adapter_path is None:
+            print("  [ERROR] Failed to download ip-adapter.bin")
+            return None, False
+        print(f"  [OK] IP-Adapter weights downloaded to: {ip_adapter_path}")
+    except Exception as e:
+        print(f"  [ERROR] Download failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, False
+    # Step 2: Load state dict
+    try:
+        print("  [2/5] Loading state dict...")
         state_dict = torch.load(ip_adapter_path, map_location="cpu")
+        print(f"  [OK] State dict loaded with {len(state_dict)} keys")
         # Extract image_proj and ip_adapter weights
         image_proj_state_dict = {}
             elif key.startswith("ip_adapter."):
                 ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value
+        print(f"  [OK] Extracted {len(image_proj_state_dict)} image_proj keys")
+        print(f"  [OK] Extracted {len(ip_adapter_state_dict)} ip_adapter keys")
+        if len(image_proj_state_dict) == 0:
+            print("  [WARNING] No image_proj weights found in state dict!")
+        if len(ip_adapter_state_dict) == 0:
+            print("  [WARNING] No ip_adapter weights found in state dict!")
+    except Exception as e:
+        print(f"  [ERROR] Failed to load state dict: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, False
+    # Step 3: Create Resampler
+    try:
+        print("  [3/5] Creating Resampler (Perceiver architecture)...")
+        # Verify pipe config
+        if not hasattr(pipe.unet, 'config'):
+            print("  [ERROR] pipe.unet has no config attribute")
+            return None, False
+        if not hasattr(pipe.unet.config, 'cross_attention_dim'):
+            print("  [ERROR] pipe.unet.config has no cross_attention_dim")
+            return None, False
+        output_dim = pipe.unet.config.cross_attention_dim
+        print(f"  [INFO] Using cross_attention_dim: {output_dim}")
         image_proj_model = Resampler(
+            dim=1280,
+            depth=4,
+            dim_head=64,
+            heads=20,
+            num_queries=16,
+            embedding_dim=512,
+            output_dim=output_dim,
+            ff_mult=4
         )
         image_proj_model.eval()
         image_proj_model = image_proj_model.to(device, dtype=dtype)
+        print(f"  [OK] Resampler created and moved to {device}")
+        # Load weights
         if image_proj_state_dict:
             try:
+                missing_keys, unexpected_keys = image_proj_model.load_state_dict(
+                    image_proj_state_dict, strict=False
+                )
+                if len(missing_keys) > 0:
+                    print(f"  [WARNING] Missing keys in Resampler: {len(missing_keys)}")
+                if len(unexpected_keys) > 0:
+                    print(f"  [WARNING] Unexpected keys in Resampler: {len(unexpected_keys)}")
                 print("  [OK] Resampler loaded with pretrained weights")
             except Exception as e:
                 print(f"  [WARNING] Could not load Resampler weights: {e}")
+                print("  [INFO] Using randomly initialized Resampler (reduced quality)")
         else:
+            print("  [WARNING] No image_proj weights available (reduced quality)")
+    except Exception as e:
+        print(f"  [ERROR] Failed to create Resampler: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, False
+    # Step 4: Setup attention processors
+    try:
+        print("  [4/5] Setting up IP-Adapter attention processors...")
         attn_procs = {}
+        num_tokens = 16
+        processor_count = 0
         for name in pipe.unet.attn_processors.keys():
             cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim
                     scale=1.0,
                     num_tokens=num_tokens
                 ).to(device, dtype=dtype)
+                processor_count += 1
+        print(f"  [OK] Created {processor_count} IP-Adapter attention processors")
         # Set attention processors
         pipe.unet.set_attn_processor(attn_procs)
+        print("  [OK] Attention processors set on UNet")
+    except Exception as e:
+        print(f"  [ERROR] Failed to setup attention processors: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, False
+    # Step 5: Load IP-Adapter weights
+    try:
+        print("  [5/5] Loading IP-Adapter weights into attention processors...")
         if ip_adapter_state_dict:
+            ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
+            missing_keys, unexpected_keys = ip_layers.load_state_dict(
+                ip_adapter_state_dict, strict=False
+            )
+            if len(missing_keys) > 0:
+                print(f"  [WARNING] Missing keys in IP-Adapter: {len(missing_keys)}")
+            if len(unexpected_keys) > 0:
+                print(f"  [WARNING] Unexpected keys in IP-Adapter: {len(unexpected_keys)}")
+            print("  [OK] IP-Adapter attention weights loaded")
         else:
+            print("  [WARNING] No ip_adapter weights available (reduced quality)")
+        # Store image encoder
         pipe.image_encoder = image_encoder
+        print("\n  [SUCCESS] IP-Adapter fully loaded with InstantID architecture")
+        print(f"    - Resampler: 4 layers, 20 heads, 16 output tokens")
+        print(f"    - Face embeddings: 512D -> 16x2048D")
+        print(f"    - Device: {device}, dtype: {dtype}\n")
         return image_proj_model, True
     except Exception as e:
+        print(f"  [ERROR] Failed to load IP-Adapter weights: {e}")
         import traceback
         traceback.print_exc()
         return None, False