Spaces:

primerz
/

pixagram-stable

Runtime error

App Files Files Community

primerz committed on Oct 31, 2025

Commit

e9201b0

verified ·

1 Parent(s): d31bf62

Upload 2 files

Browse files

Files changed (2) hide show

generator.py +163 -102
models.py +50 -32

generator.py CHANGED Viewed

@@ -20,7 +20,7 @@ from models import (
     load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
     load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
     setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip,
-    load_openpose_detector
 )
@@ -34,17 +34,24 @@ class RetroArtConverter:
             'custom_checkpoint': False,
             'lora': False,
             'instantid': False,
-            'zoe_depth': False,
             'ip_adapter': False,
-            'openpose': False
         }
-        # Initialize face analysis
         self.face_app, self.face_detection_enabled = load_face_analysis()
-        # Load Zoe Depth detector
-        self.zoe_depth, zoe_success = load_depth_detector()
-        self.models_loaded['zoe_depth'] = zoe_success
         # --- NEW: Load OpenPose detector ---
         self.openpose_detector, openpose_success = load_openpose_detector()
@@ -182,8 +189,11 @@ class RetroArtConverter:
         print("============================\n")
     def get_depth_map(self, image):
-            """Generate depth map using Zoe Depth"""
-            if self.zoe_depth is not None:
                 try:
                     if image.mode != 'RGB':
                         image = image.convert('RGB')
@@ -203,40 +213,38 @@ class RetroArtConverter:
                     image_for_depth = image.resize(size_for_depth, Image.LANCZOS)
                     if target_width != orig_width or target_height != orig_height:
-                        print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
                     # FIXED: Add torch.no_grad() wrapper
                     with torch.no_grad():
-                        depth_image = self.zoe_depth(image_for_depth)
                     depth_width, depth_height = depth_image.size
                     if depth_width != orig_width or depth_height != orig_height:
                         depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
-                    print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
                     return depth_image
                 except Exception as e:
-                    print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
                     gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
                     depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
                     return Image.fromarray(depth_colored)
             else:
                 gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
                 depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
                 return Image.fromarray(depth_colored)
-    def add_trigger_word(self, prompt, lora_name="retroart"):
         """Add trigger word to prompt if not present"""
-        from config import LORA_CONFIGS
-        trigger_word = LORA_CONFIGS.get(lora_name, {}).get("trigger", TRIGGER_WORD)
-        if trigger_word.lower() not in prompt.lower():
             if not prompt or not prompt.strip():
-                return trigger_word
-            return f"{trigger_word}, {prompt}"
         return prompt
     def extract_multi_scale_face(self, face_crop, face):
@@ -487,7 +495,6 @@ class RetroArtConverter:
         identity_control_scale=0.85,
         expression_control_scale=0.6,
         lora_scale=1.0,
-        lora_name="retroart",
         identity_preservation=0.8,
         strength=0.75,
         enable_color_matching=False,
@@ -514,7 +521,7 @@ class RetroArtConverter:
                 )
         # Add trigger word
-        prompt = self.add_trigger_word(prompt, lora_name)
         # Calculate optimal size with flexible aspect ratio support
         original_width, original_height = input_image.size
@@ -532,7 +539,7 @@ class RetroArtConverter:
         # Generate depth map
         depth_image = None
         if self.depth_active:
-            print("Generating Zoe depth map...")
             depth_image = self.get_depth_map(resized_image)
             if depth_image.size != (target_width, target_height):
                 depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
@@ -557,88 +564,142 @@ class RetroArtConverter:
         has_detected_faces = False
         face_bbox_original = None
-        if self.instantid_active and self.face_app is not None: # <-- Check instantid_active
-            print("Detecting faces and extracting keypoints...")
-            img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
-            faces = self.face_app.get(img_array)
-            if len(faces) > 0:
-                has_detected_faces = True
-                print(f"Detected {len(faces)} face(s)")
-                # Get largest face
-                face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
-                # ADAPTIVE PARAMETERS
-                adaptive_params = self.detect_face_quality(face)
-                if adaptive_params is not None:
-                    print(f"[ADAPTIVE] {adaptive_params['reason']}")
-                    identity_preservation = adaptive_params['identity_preservation']
-                    identity_control_scale = adaptive_params['identity_control_scale']
-                    guidance_scale = adaptive_params['guidance_scale']
-                    lora_scale = adaptive_params['lora_scale']
-                # Extract face embeddings
-                face_embeddings_base = face.normed_embedding
-                # Extract face crop
-                bbox = face.bbox.astype(int)
-                x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
-                face_bbox_original = [x1, y1, x2, y2]
-                # Add padding
-                face_width = x2 - x1
-                face_height = y2 - y1
-                padding_x = int(face_width * 0.3)
-                padding_y = int(face_height * 0.3)
-                x1 = max(0, x1 - padding_x)
-                y1 = max(0, y1 - padding_y)
-                x2 = min(resized_image.width, x2 + padding_x)
-                y2 = min(resized_image.height, y2 + padding_y)
-                # Crop face region
-                face_crop = resized_image.crop((x1, y1, x2, y2))
-                # MULTI-SCALE PROCESSING
-                face_embeddings = self.extract_multi_scale_face(face_crop, face)
-                # Enhance face crop
-                face_crop_enhanced = enhance_face_crop(face_crop)
-                # Draw keypoints
-                face_kps = face.kps
-                face_kps_image = draw_kps(resized_image, face_kps)
-                # ENHANCED: Extract comprehensive facial attributes
-                from utils import get_facial_attributes, build_enhanced_prompt
-                facial_attrs = get_facial_attributes(face)
-                # Update prompt with detected attributes
-                prompt = build_enhanced_prompt(prompt, facial_attrs, TRIGGER_WORD)
-                # Legacy output for compatibility
-                age = facial_attrs['age']
-                gender_code = facial_attrs['gender']
-                det_score = facial_attrs['quality']
-                gender_str = 'M' if gender_code == 1 else ('F' if gender_code == 0 else 'N/A')
-                print(f"Face info: bbox={face.bbox}, age={age if age else 'N/A'}, gender={gender_str}")
-                print(f"Face crop size: {face_crop.size}, enhanced: {face_crop_enhanced.size if face_crop_enhanced else 'N/A'}")
         # Set LORA scale
         if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
             try:
-                self.pipe.set_adapters([lora_name], adapter_weights=[lora_scale])
-                print(f"LoRA ({lora_name}) scale: {lora_scale}")
             except Exception as e:
-                print(f"Could not set LoRA scale: {e}")
-                # Try fallback to retroart if selected LoRA fails
-                if lora_name != "retroart":
-                    try:
-                        self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
-                        print(f"Fallback to RetroArt LoRA, scale: {lora_scale}")
-                    except:
-                        pass
         # Prepare generation kwargs
         pipe_kwargs = {
@@ -727,11 +788,11 @@ class RetroArtConverter:
                     print("  Face detected but IP-Adapter/embeddings unavailable, using keypoints only")
             else:
-                # No face, must add a blank image to keep list order
-                print("Using blank map for InstantID (no face/disabled)")
                 control_images.append(Image.new("RGB", (target_width, target_height), (0,0,0)))
                 conditioning_scales.append(0.0) # Set scale to 0
-                scale_debug_str.append("Identity: 0.00")
         # 2. Depth
         if self.depth_active:

     load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
     load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
     setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip,
+    load_openpose_detector, load_mediapipe_face_detector
 )
             'custom_checkpoint': False,
             'lora': False,
             'instantid': False,
+            'depth_detector': False,
+            'depth_type': None,
             'ip_adapter': False,
+            'openpose': False,
+            'mediapipe_face': False
         }
+        # Initialize face analysis (InsightFace)
         self.face_app, self.face_detection_enabled = load_face_analysis()
+        # Load MediapipeFaceDetector (alternative face detection)
+        self.mediapipe_face, mediapipe_success = load_mediapipe_face_detector()
+        self.models_loaded['mediapipe_face'] = mediapipe_success
+        # Load Depth detector with fallback hierarchy (Leres -> Midas)
+        self.depth_detector, self.depth_type, depth_success = load_depth_detector()
+        self.models_loaded['depth_detector'] = depth_success
+        self.models_loaded['depth_type'] = self.depth_type
         # --- NEW: Load OpenPose detector ---
         self.openpose_detector, openpose_success = load_openpose_detector()
         print("============================\n")
     def get_depth_map(self, image):
+            """
+            Generate depth map using available depth detector.
+            Supports: LeresDetector or MidasDetector.
+            """
+            if self.depth_detector is not None:
                 try:
                     if image.mode != 'RGB':
                         image = image.convert('RGB')
                     image_for_depth = image.resize(size_for_depth, Image.LANCZOS)
                     if target_width != orig_width or target_height != orig_height:
+                        print(f"[DEPTH] Resized for {self.depth_type.upper()}Detector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
                     # FIXED: Add torch.no_grad() wrapper
                     with torch.no_grad():
+                        depth_image = self.depth_detector(image_for_depth)
                     depth_width, depth_height = depth_image.size
                     if depth_width != orig_width or depth_height != orig_height:
                         depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
+                    print(f"[DEPTH] {self.depth_type.upper()} depth map generated: {orig_width}x{orig_height}")
                     return depth_image
                 except Exception as e:
+                    print(f"[DEPTH] {self.depth_type.upper()}Detector failed ({e}), falling back to grayscale depth")
                     gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
                     depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
                     return Image.fromarray(depth_colored)
             else:
+                # No depth detector available, use grayscale fallback
+                print("[DEPTH] No depth detector available, using grayscale fallback")
                 gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
                 depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
                 return Image.fromarray(depth_colored)
+    def add_trigger_word(self, prompt):
         """Add trigger word to prompt if not present"""
+        if TRIGGER_WORD.lower() not in prompt.lower():
             if not prompt or not prompt.strip():
+                return TRIGGER_WORD
+            return f"{TRIGGER_WORD}, {prompt}"
         return prompt
     def extract_multi_scale_face(self, face_crop, face):
         identity_control_scale=0.85,
         expression_control_scale=0.6,
         lora_scale=1.0,
         identity_preservation=0.8,
         strength=0.75,
         enable_color_matching=False,
                 )
         # Add trigger word
+        prompt = self.add_trigger_word(prompt)
         # Calculate optimal size with flexible aspect ratio support
         original_width, original_height = input_image.size
         # Generate depth map
         depth_image = None
         if self.depth_active:
+            print("Generating depth map...")
             depth_image = self.get_depth_map(resized_image)
             if depth_image.size != (target_width, target_height):
                 depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
         has_detected_faces = False
         face_bbox_original = None
+        if self.instantid_active:
+            # Try InsightFace first (if available)
+            insightface_tried = False
+            insightface_success = False
+            if self.face_app is not None:
+                print("Detecting faces with InsightFace...")
+                insightface_tried = True
+                try:
+                    img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
+                    faces = self.face_app.get(img_array)
+                    if len(faces) > 0:
+                        insightface_success = True
+                        has_detected_faces = True
+                        print(f"✓ InsightFace detected {len(faces)} face(s)")
+                        # Get largest face
+                        face = sorted(faces, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]))[-1]
+                        # ADAPTIVE PARAMETERS
+                        adaptive_params = self.detect_face_quality(face)
+                        if adaptive_params is not None:
+                            print(f"[ADAPTIVE] {adaptive_params['reason']}")
+                            identity_preservation = adaptive_params['identity_preservation']
+                            identity_control_scale = adaptive_params['identity_control_scale']
+                            guidance_scale = adaptive_params['guidance_scale']
+                            lora_scale = adaptive_params['lora_scale']
+                        # Extract face embeddings
+                        face_embeddings_base = face.normed_embedding
+                        # Extract face crop
+                        bbox = face.bbox.astype(int)
+                        x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
+                        face_bbox_original = [x1, y1, x2, y2]
+                        # Add padding
+                        face_width = x2 - x1
+                        face_height = y2 - y1
+                        padding_x = int(face_width * 0.3)
+                        padding_y = int(face_height * 0.3)
+                        x1 = max(0, x1 - padding_x)
+                        y1 = max(0, y1 - padding_y)
+                        x2 = min(resized_image.width, x2 + padding_x)
+                        y2 = min(resized_image.height, y2 + padding_y)
+                        # Crop face region
+                        face_crop = resized_image.crop((x1, y1, x2, y2))
+                        # MULTI-SCALE PROCESSING
+                        face_embeddings = self.extract_multi_scale_face(face_crop, face)
+                        # Enhance face crop
+                        face_crop_enhanced = enhance_face_crop(face_crop)
+                        # Draw keypoints
+                        face_kps = face.kps
+                        face_kps_image = draw_kps(resized_image, face_kps)
+                        # ENHANCED: Extract comprehensive facial attributes
+                        from utils import get_facial_attributes, build_enhanced_prompt
+                        facial_attrs = get_facial_attributes(face)
+                        # Update prompt with detected attributes
+                        prompt = build_enhanced_prompt(prompt, facial_attrs, TRIGGER_WORD)
+                        # Legacy output for compatibility
+                        age = facial_attrs['age']
+                        gender_code = facial_attrs['gender']
+                        det_score = facial_attrs['quality']
+                        gender_str = 'M' if gender_code == 1 else ('F' if gender_code == 0 else 'N/A')
+                        print(f"Face info: bbox={face.bbox}, age={age if age else 'N/A'}, gender={gender_str}")
+                        print(f"Face crop size: {face_crop.size}, enhanced: {face_crop_enhanced.size if face_crop_enhanced else 'N/A'}")
+                    else:
+                        print("✗ InsightFace found no faces")
+                except Exception as e:
+                    print(f"[ERROR] InsightFace detection failed: {e}")
+                    import traceback
+                    traceback.print_exc()
+            else:
+                print("[INFO] InsightFace not available (face_app is None)")
+            # If InsightFace didn't succeed, try MediapipeFace
+            if not insightface_success:
+                if self.mediapipe_face is not None:
+                    print("Trying MediapipeFaceDetector as fallback...")
+                    try:
+                        # MediapipeFace returns an annotated image with keypoints
+                        mediapipe_result = self.mediapipe_face(resized_image)
+                        # Check if face was detected (result is not blank/black)
+                        mediapipe_array = np.array(mediapipe_result)
+                        if mediapipe_array.sum() > 1000:  # If image has significant content
+                            has_detected_faces = True
+                            face_kps_image = mediapipe_result
+                            print(f"✓ MediapipeFace detected face(s)")
+                            print(f"[INFO] Using MediapipeFace keypoints (no embeddings available)")
+                            # Note: MediapipeFace doesn't provide embeddings or detailed info
+                            # So face_embeddings, face_crop_enhanced remain None
+                            # InstantID will work with keypoints only (reduced quality)
+                        else:
+                            print("✗ MediapipeFace found no faces")
+                    except Exception as e:
+                        print(f"[ERROR] MediapipeFace detection failed: {e}")
+                        import traceback
+                        traceback.print_exc()
+                else:
+                    print("[INFO] MediapipeFaceDetector not available")
+            # Final summary
+            if not has_detected_faces:
+                print("\n[SUMMARY] No faces detected by any detector")
+                if insightface_tried:
+                    print("  - InsightFace: tried, found nothing")
+                else:
+                    print("  - InsightFace: not available")
+                if self.mediapipe_face is not None:
+                    print("  - MediapipeFace: tried, found nothing")
+                else:
+                    print("  - MediapipeFace: not available")
+                print()
         # Set LORA scale
         if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
             try:
+                self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
+                print(f"LORA scale: {lora_scale}")
             except Exception as e:
+                print(f"Could not set LORA scale: {e}")
         # Prepare generation kwargs
         pipe_kwargs = {
                     print("  Face detected but IP-Adapter/embeddings unavailable, using keypoints only")
             else:
+                # No face detected - blank map needed to maintain ControlNet list order
+                print("[INSTANTID] Using blank map (scale=0, no effect on generation)")
                 control_images.append(Image.new("RGB", (target_width, target_height), (0,0,0)))
                 conditioning_scales.append(0.0) # Set scale to 0
+                scale_debug_str.append("Identity: 0.00 (no face)")
         # 2. Depth
         if self.depth_active:

models.py CHANGED Viewed

@@ -13,7 +13,7 @@ from diffusers import (
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
-from controlnet_aux import ZoeDetector, OpenposeDetector  # <-- NEW
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
@@ -82,16 +82,34 @@ def load_face_analysis():
 def load_depth_detector():
-    """Load Zoe Depth detector."""
-    print("Loading Zoe Depth detector...")
     try:
-        zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
-        zoe_depth.to(device)
-        print("  [OK] Zoe Depth loaded successfully")
-        return zoe_depth, True
     except Exception as e:
-        print(f"  [WARNING] Zoe Depth not available: {e}")
-        return None, False
 # --- NEW FUNCTION ---
 def load_openpose_detector():
@@ -107,11 +125,24 @@ def load_openpose_detector():
         return None, False
 # --- END NEW FUNCTION ---
 def load_controlnets():
     """Load ControlNet models."""
-    print("Loading ControlNet Zoe Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
-        "diffusers/controlnet-zoe-depth-sdxl-1.0",
         torch_dtype=dtype
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
@@ -188,29 +219,16 @@ def load_sdxl_pipeline(controlnets):
 def load_lora(pipe):
-    """Load both LORA models from HuggingFace Hub."""
-    print("Loading LoRAs from HuggingFace Hub...")
-    success_count = 0
-    # Load RetroArt LoRA
     try:
-        lora_path_retroart = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora_retroart'])
-        pipe.load_lora_weights(lora_path_retroart, adapter_name="retroart")
-        print(f"  [OK] RetroArt LoRA loaded successfully")
-        success_count += 1
     except Exception as e:
-        print(f"  [WARNING] Could not load RetroArt LoRA: {e}")
-    # Load VGA LoRA
-    try:
-        lora_path_vga = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora_vga'])
-        pipe.load_lora_weights(lora_path_vga, adapter_name="vga")
-        print(f"  [OK] VGA LoRA loaded successfully")
-        success_count += 1
-    except Exception as e:
-        print(f"  [WARNING] Could not load VGA LoRA: {e}")
-    return success_count > 0
 def setup_ip_adapter(pipe, image_encoder):

 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
+from controlnet_aux import OpenposeDetector, LeresDetector, MidasDetector, MediapipeFaceDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
 def load_depth_detector():
+    """
+    Load depth detector with fallback hierarchy: Leres -> Midas.
+    Returns (detector, detector_type, success).
+    """
+    print("Loading depth detector with fallback hierarchy...")
+    # Try LeresDetector first (best quality)
     try:
+        print("  Attempting LeresDetector (highest quality)...")
+        leres_depth = LeresDetector.from_pretrained("lllyasviel/Annotators")
+        leres_depth.to(device)
+        print("  [OK] LeresDetector loaded successfully")
+        return leres_depth, 'leres', True
     except Exception as e:
+        print(f"  [INFO] LeresDetector not available: {e}")
+    # Fallback to MidasDetector
+    try:
+        print("  Attempting MidasDetector (fallback)...")
+        midas_depth = MidasDetector.from_pretrained("lllyasviel/Annotators")
+        midas_depth.to(device)
+        print("  [OK] MidasDetector loaded successfully")
+        return midas_depth, 'midas', True
+    except Exception as e:
+        print(f"  [WARNING] MidasDetector not available: {e}")
+    print("  [ERROR] No depth detector available")
+    return None, None, False
 # --- NEW FUNCTION ---
 def load_openpose_detector():
         return None, False
 # --- END NEW FUNCTION ---
+# --- NEW FUNCTION ---
+def load_mediapipe_face_detector():
+    """Load MediapipeFaceDetector for advanced face detection."""
+    print("Loading MediapipeFaceDetector...")
+    try:
+        face_detector = MediapipeFaceDetector()
+        print("  [OK] MediapipeFaceDetector loaded successfully")
+        return face_detector, True
+    except Exception as e:
+        print(f"  [WARNING] MediapipeFaceDetector not available: {e}")
+        return None, False
+# --- END NEW FUNCTION ---
 def load_controlnets():
     """Load ControlNet models."""
+    print("Loading ControlNet Depth model...")
     controlnet_depth = ControlNetModel.from_pretrained(
+        "diffusers/controlnet-zoe-depth-sdxl-1.0",  # Model repo name (not tied to detector)
         torch_dtype=dtype
     ).to(device)
     print("  [OK] ControlNet Depth loaded")
 def load_lora(pipe):
+    """Load LORA from HuggingFace Hub."""
+    print("Loading LORA (retroart) from HuggingFace Hub...")
     try:
+        lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
+        pipe.load_lora_weights(lora_path, adapter_name="retroart")
+        print(f"  [OK] LORA loaded successfully")
+        return True
     except Exception as e:
+        print(f"  [WARNING] Could not load LORA: {e}")
+        return False
 def setup_ip_adapter(pipe, image_encoder):