BiliSakura
/

PixelFlow-diffusers

@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
 import json
 import math
@@ -19,7 +23,6 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
-import numpy as np
 import torch
 import torch.nn.functional as F
 from einops import rearrange
@@ -27,8 +30,6 @@ from einops import rearrange
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
@@ -38,32 +39,19 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> from pathlib import Path
         >>> import torch
-        >>> from diffusers import DiffusionPipeline
         >>> model_dir = Path("./PixelFlow-256").resolve()
-        >>> pipe = DiffusionPipeline.from_pretrained(
         ...     str(model_dir),
         ...     local_files_only=True,
-        ...     custom_pipeline=str(model_dir / "pipeline.py"),
-        ...     trust_remote_code=True,
         ...     torch_dtype=torch.bfloat16,
         ... )
-        >>> pipe = pipe.to("cuda")
-        >>> print(pipe.id2label[207])
-        >>> print(pipe.get_label_ids("golden retriever"))
-        >>> generator = torch.Generator(device="cuda").manual_seed(42)
-        >>> image = pipe(
-        ...     class_labels="golden retriever",
-        ...     height=256,
-        ...     width=256,
-        ...     num_inference_steps=[10, 10, 10, 10],
-        ...     guidance_scale=4.0,
-        ...     generator=generator,
-        ... ).images[0]
-        >>> image.save("demo.png")
         ```
 """
@@ -76,12 +64,27 @@ class PixelFlowPipeline(DiffusionPipeline):
     Parameters:
         transformer ([`PixelFlowTransformer2DModel`]):
             Class-conditional PixelFlow transformer operating in pixel space.
-        scheduler ([`PixelFlowScheduler`] or [`KarrasDiffusionSchedulers`]):
-            Multi-stage flow scheduler used by PixelFlow cascade denoising.
         id2label (`dict[int, str]`, *optional*):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
     model_cpu_offload_seq = "transformer"
     def __init__(
@@ -128,7 +131,6 @@ class PixelFlowPipeline(DiffusionPipeline):
                 variant = variant / subfolder
         id2label_override = kwargs.pop("id2label", None)
-        kwargs.pop("trust_remote_code", None)
         model_kwargs = dict(kwargs)
         scheduler_kwargs = model_kwargs.pop("scheduler_kwargs", {})
         inserted = []
@@ -147,14 +149,15 @@ class PixelFlowPipeline(DiffusionPipeline):
             transformer_cls = getattr(importlib.import_module("transformer_pixelflow"), "PixelFlowTransformer2DModel")
             transformer = transformer_cls.from_pretrained(str(transformer_dir), **model_kwargs)
-            scheduler_dir = variant / "scheduler"
-            if not (scheduler_dir / "scheduler_config.json").exists():
-                raise FileNotFoundError(f"Expected scheduler config in {scheduler_dir}")
-            _ensure_path(str(scheduler_dir))
             scheduler_cls = getattr(importlib.import_module("scheduling_pixelflow"), "PixelFlowScheduler")
             try:
-                scheduler = scheduler_cls.from_pretrained(str(scheduler_dir), **scheduler_kwargs)
             except Exception:
                 scheduler = scheduler_cls(**scheduler_kwargs)
@@ -168,11 +171,41 @@ class PixelFlowPipeline(DiffusionPipeline):
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
-    @staticmethod
-    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
-        """Return a new mapping with every key of ``id2label`` converted via ``int``.
-
-        A ``None`` or empty ``id2label`` yields an empty dict. Values are kept as-is.
-        """
-        if not id2label:
-            return {}
-        return {int(key): value for key, value in id2label.items()}
     def _ensure_labels_loaded(self) -> None:
         if self._labels_loaded_from_model_index:
@@ -183,6 +216,12 @@ class PixelFlowPipeline(DiffusionPipeline):
             self.labels = self._build_label2id(self._id2label)
         self._labels_loaded_from_model_index = True
     @staticmethod
     def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
         if not variant_path:
@@ -304,15 +343,23 @@ class PixelFlowPipeline(DiffusionPipeline):
         channels: int,
         height: int,
         width: int,
         eps: float = 1e-6,
     ) -> torch.Tensor:
         gamma = self.scheduler.gamma
-        dist = torch.distributions.multivariate_normal.MultivariateNormal(
-            torch.zeros(4),
-            torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
-        )
         block_number = batch_size * channels * (height // 2) * (width // 2)
-        noise = torch.stack([dist.sample() for _ in range(block_number)])
         return rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
@@ -331,6 +378,7 @@ class PixelFlowPipeline(DiffusionPipeline):
         height: int,
         width: int,
         device: torch.device,
     ) -> torch.Tensor:
         latents = F.interpolate(latents, size=(height, width), mode="nearest")
         original_start_t = self.scheduler.original_start_t[stage_idx]
@@ -338,8 +386,12 @@ class PixelFlowPipeline(DiffusionPipeline):
         alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
         beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
-        noise = self._sample_block_noise(*latents.shape)
-        noise = noise.to(device=device, dtype=latents.dtype)
         return alpha * latents + beta * noise
     def _prepare_rope_pos_embed(self, latents: torch.Tensor, device: torch.device) -> torch.Tensor:
@@ -378,7 +430,6 @@ class PixelFlowPipeline(DiffusionPipeline):
         raise ValueError(f"output_type must be one of: 'pil', 'np', 'pt', 'latent'. Got {output_type}.")
     @torch.inference_mode()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
@@ -394,9 +445,6 @@ class PixelFlowPipeline(DiffusionPipeline):
         r"""
         Generate class-conditional images with PixelFlow.
-        Examples:
-            <!-- this section is replaced by replace_example_docstring -->
         Args:
             class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
                 ImageNet class indices or human-readable English label strings.
@@ -435,37 +483,39 @@ class PixelFlowPipeline(DiffusionPipeline):
         autocast_enabled = device.type == "cuda"
         autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
-        with self.progress_bar(total=sum(stage_steps)) as progress_bar:
-            for stage_idx in range(self.scheduler.num_stages):
-                self.scheduler.set_timesteps(stage_steps[stage_idx], stage_idx, device=device, shift=shift)
-                timesteps = self.scheduler.Timesteps
-                if stage_idx > 0:
-                    height, width = height * 2, width * 2
-                    latents = self._upsample_latents_for_stage(latents, stage_idx, height, width, device)
-                    size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
-                rope_pos = self._prepare_rope_pos_embed(latents, device)
-                for timestep in timesteps:
-                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                    timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
-                    with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
-                        noise_pred = self.transformer(
-                            latent_model_input,
-                            timestep=timestep_batch,
-                            class_labels=conditioning,
-                            latent_size=size_tensor,
-                            pos_embed=rope_pos,
-                        ).sample
-                    if do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        stage_scale = self._stage_guidance_scale(stage_idx, guidance_scale)
-                        noise_pred = noise_pred_uncond + stage_scale * (noise_pred_text - noise_pred_uncond)
-                    latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
-                    progress_bar.update()
         image = self.decode_latents(latents, output_type=output_type)
         self.maybe_free_model_hooks()

 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+import inspect
 import importlib
 import json
 import math
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 from einops import rearrange
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models.embeddings import get_2d_rotary_pos_embed
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
     Examples:
         ```py
         >>> from pathlib import Path
+        >>> import sys
         >>> import torch
         >>> model_dir = Path("./PixelFlow-256").resolve()
+        >>> sys.path.insert(0, str(model_dir))
+        >>> from pipeline import PixelFlowPipeline
+        >>> pipe = PixelFlowPipeline.from_pretrained(
         ...     str(model_dir),
         ...     local_files_only=True,
         ...     torch_dtype=torch.bfloat16,
         ... )
+        >>> pipe.to("cuda")
         ```
 """
     Parameters:
         transformer ([`PixelFlowTransformer2DModel`]):
             Class-conditional PixelFlow transformer operating in pixel space.
+        scheduler ([`PixelFlowScheduler`]):
+            Multi-stage flow scheduler used by PixelFlow.
         id2label (`dict[int, str]`, *optional*):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        """Build the optional keyword arguments that ``scheduler.step`` accepts.
+
+        The signature of ``scheduler.step`` is inspected so that only parameters
+        it actually declares are forwarded.
+
+        Args:
+            scheduler: Object exposing a ``step`` callable.
+            generator: Forwarded under ``"generator"`` (even when ``None``) if
+                ``step`` declares a parameter of that name.
+            eta: Forwarded under ``"eta"`` only when it is not ``None`` and
+                ``step`` declares a parameter of that name.
+
+        Returns:
+            A dict holding the subset of ``generator`` / ``eta`` to pass to ``step``.
+        """
+        accepted = inspect.signature(scheduler.step).parameters
+        extra = {}
+        if "generator" in accepted:
+            extra["generator"] = generator
+        if "eta" in accepted and eta is not None:
+            extra["eta"] = eta
+        return extra
     model_cpu_offload_seq = "transformer"
     def __init__(
                 variant = variant / subfolder
         id2label_override = kwargs.pop("id2label", None)
         model_kwargs = dict(kwargs)
         scheduler_kwargs = model_kwargs.pop("scheduler_kwargs", {})
         inserted = []
             transformer_cls = getattr(importlib.import_module("transformer_pixelflow"), "PixelFlowTransformer2DModel")
             transformer = transformer_cls.from_pretrained(str(transformer_dir), **model_kwargs)
+            scheduling_py = variant / "scheduling_pixelflow.py"
+            scheduler_cfg_dir = variant / "scheduler"
+            if not scheduling_py.is_file() or not (scheduler_cfg_dir / "scheduler_config.json").exists():
+                raise FileNotFoundError(f"Expected scheduler module at {scheduling_py} and config in {scheduler_cfg_dir}")
+            _ensure_path(str(variant.resolve()))
             scheduler_cls = getattr(importlib.import_module("scheduling_pixelflow"), "PixelFlowScheduler")
             try:
+                scheduler = scheduler_cls.from_pretrained(str(scheduler_cfg_dir), **scheduler_kwargs)
             except Exception:
                 scheduler = scheduler_cls(**scheduler_kwargs)
                 if comp_path in sys.path:
                     sys.path.remove(comp_path)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+        """Load the pipeline, falling back to assembling it component by component.
+
+        The parent class loader is tried first with the caller's keyword
+        arguments unchanged. If it raises, the transformer and the scheduler are
+        loaded individually and the pipeline is constructed from them.
+
+        Args:
+            pretrained_model_name_or_path: Model directory (or identifier) to load from.
+            **kwargs: Passed as-is to the parent loader. On the fallback path,
+                ``transformer_subfolder``, ``scheduler_subfolder`` and
+                ``scheduler_kwargs`` are consumed here; every remaining entry is
+                forwarded to the transformer loader.
+
+        Returns:
+            The pipeline returned by the parent loader, or one built by the fallback.
+        """
+        # Work on a copy so the parent loader still receives the original kwargs.
+        model_kwargs = dict(kwargs)
+        transformer_subfolder = model_kwargs.pop("transformer_subfolder", None)
+        scheduler_subfolder = model_kwargs.pop("scheduler_subfolder", None)
+        scheduler_kwargs = model_kwargs.pop("scheduler_kwargs", {})
+        base_path = Path(pretrained_model_name_or_path)
+        # Default to the conventional sub-directories when they exist on disk.
+        if transformer_subfolder is None and (base_path / "transformer").exists():
+            transformer_subfolder = "transformer"
+        if scheduler_subfolder is None and (base_path / "scheduler").exists():
+            scheduler_subfolder = "scheduler"
+        try:
+            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        except Exception:
+            # Deliberate best-effort: any failure of the parent loader triggers
+            # the manual assembly below.
+            if transformer_subfolder is not None:
+                transformer_path = str(base_path / transformer_subfolder)
+            else:
+                transformer_path = pretrained_model_name_or_path
+            # NOTE(review): PixelFlowTransformer2DModel and PixelFlowScheduler must be
+            # resolvable as module-level names here; their import is not visible in
+            # this excerpt - confirm.
+            transformer = PixelFlowTransformer2DModel.from_pretrained(transformer_path, **model_kwargs)
+            try:
+                scheduler = PixelFlowScheduler.from_pretrained(
+                    pretrained_model_name_or_path,
+                    subfolder=scheduler_subfolder,
+                    **scheduler_kwargs,
+                )
+            except Exception:
+                # No loadable scheduler config: build one from the given kwargs alone.
+                scheduler = PixelFlowScheduler(**scheduler_kwargs)
+            id2label = cls._read_id2label_from_model_index(str(base_path))
+            pipe = cls(transformer=transformer, scheduler=scheduler, id2label=id2label)
+            if hasattr(pipe, "register_to_config"):
+                pipe.register_to_config(_name_or_path=str(base_path))
+            return pipe
     def _ensure_labels_loaded(self) -> None:
         if self._labels_loaded_from_model_index:
             self.labels = self._build_label2id(self._id2label)
         self._labels_loaded_from_model_index = True
+    @staticmethod
+    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
+        """Return a new mapping with every key of ``id2label`` converted via ``int``.
+
+        A ``None`` or empty ``id2label`` yields an empty dict. Values are kept as-is.
+        """
+        normalized: Dict[int, str] = {}
+        if id2label:
+            for raw_id, label in id2label.items():
+                normalized[int(raw_id)] = label
+        return normalized
     @staticmethod
     def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
         if not variant_path:
         channels: int,
         height: int,
         width: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         eps: float = 1e-6,
     ) -> torch.Tensor:
         gamma = self.scheduler.gamma
+        cov = torch.eye(4, dtype=torch.float32) * (1 - gamma) + torch.ones(4, 4, dtype=torch.float32) * gamma
+        cov = cov + eps * torch.eye(4, dtype=torch.float32)
+        chol = torch.linalg.cholesky(cov).to(device=device, dtype=dtype)
         block_number = batch_size * channels * (height // 2) * (width // 2)
+        standard = randn_tensor(
+            (block_number, 4),
+            generator=generator,
+            device=device,
+            dtype=dtype,
+        )
+        noise = standard @ chol.T
         return rearrange(
             noise,
             "(b c h w) (p q) -> b c (h p) (w q)",
         height: int,
         width: int,
         device: torch.device,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
     ) -> torch.Tensor:
         latents = F.interpolate(latents, size=(height, width), mode="nearest")
         original_start_t = self.scheduler.original_start_t[stage_idx]
         alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
         beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
+        noise = self._sample_block_noise(
+            *latents.shape,
+            device=device,
+            dtype=latents.dtype,
+            generator=generator,
+        )
         return alpha * latents + beta * noise
     def _prepare_rope_pos_embed(self, latents: torch.Tensor, device: torch.device) -> torch.Tensor:
         raise ValueError(f"output_type must be one of: 'pil', 'np', 'pt', 'latent'. Got {output_type}.")
     @torch.inference_mode()
     def __call__(
         self,
         class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
         r"""
         Generate class-conditional images with PixelFlow.
         Args:
             class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
                 ImageNet class indices or human-readable English label strings.
         autocast_enabled = device.type == "cuda"
         autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
+        for stage_idx in range(self.scheduler.num_stages):
+            self.scheduler.set_timesteps(stage_steps[stage_idx], stage_idx, device=device, shift=shift)
+            timesteps = self.scheduler.Timesteps
+            if stage_idx > 0:
+                height, width = height * 2, width * 2
+                latents = self._upsample_latents_for_stage(
+                    latents, stage_idx, height, width, device, generator=generator
+                )
+                size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
+            rope_pos = self._prepare_rope_pos_embed(latents, device)
+            for timestep in timesteps:
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
+                    noise_pred = self.transformer(
+                        latent_model_input,
+                        timestep=timestep_batch,
+                        class_labels=conditioning,
+                        latent_size=size_tensor,
+                        pos_embed=rope_pos,
+                    ).sample
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    stage_scale = self._stage_guidance_scale(stage_idx, guidance_scale)
+                    noise_pred = noise_pred_uncond + stage_scale * (noise_pred_text - noise_pred_uncond)
+                latents = self.scheduler.step(model_output=noise_pred, sample=latents, **extra_step_kwargs).prev_sample
         image = self.decode_latents(latents, output_type=output_type)
         self.maybe_free_model_hooks()