Address review comments: rename Cosmos2_5_PredictBase to Cosmos2_5_PredictBasePipeline

miguelmartin75 · miguelmartin75 · commit c14a3dad8a1a · 2025-12-18T03:25:08.000Z
diff --git a/docs/source/en/api/pipelines/cosmos.md b/docs/source/en/api/pipelines/cosmos.md
@@ -70,6 +70,12 @@ output.save("output.png")
   - all
   - __call__
 
+## Cosmos2_5_PredictBasePipeline
+
+[[autodoc]] Cosmos2_5_PredictBasePipeline
+  - all
+  - __call__
+
 ## CosmosPipelineOutput
 
 [[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
diff --git a/scripts/convert_cosmos_to_diffusers.py b/scripts/convert_cosmos_to_diffusers.py
@@ -63,7 +63,7 @@
     FlowMatchEulerDiscreteScheduler,
     UniPCMultistepScheduler,
 )
-from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBase
+from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline
 
 
 def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -545,7 +545,7 @@ def save_pipeline_cosmos2_5(args, transformer, vae):
         sigma_min=0.01,
     )
 
-    pipe = Cosmos2_5_PredictBase(
+    pipe = Cosmos2_5_PredictBasePipeline(
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         transformer=transformer,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -463,7 +463,7 @@
             "CogView4ControlPipeline",
             "CogView4Pipeline",
             "ConsisIDPipeline",
-            "Cosmos2_5_PredictBase",
+            "Cosmos2_5_PredictBasePipeline",
             "Cosmos2_5_PredictImage2World",
             "Cosmos2_5_PredictText2World",
             "Cosmos2_5_PredictVideo2World",
@@ -1179,7 +1179,7 @@
             CogView4ControlPipeline,
             CogView4Pipeline,
             ConsisIDPipeline,
-            Cosmos2_5_PredictBase,
+            Cosmos2_5_PredictBasePipeline,
             Cosmos2_5_PredictImage2World,
             Cosmos2_5_PredictText2World,
             Cosmos2_5_PredictVideo2World,
diff --git a/src/diffusers/models/transformers/transformer_cosmos.py b/src/diffusers/models/transformers/transformer_cosmos.py
@@ -488,8 +488,7 @@ def __init__(
             hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False
         )
 
-        self.use_crossattn_projection = use_crossattn_projection
-        if self.use_crossattn_projection:
+        if self.config.use_crossattn_projection:
             self.crossattn_proj = nn.Sequential(
                 nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),
                 nn.GELU(),
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
@@ -165,7 +165,7 @@
     _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
     _import_structure["consisid"] = ["ConsisIDPipeline"]
     _import_structure["cosmos"] = [
-        "Cosmos2_5_PredictBase",
+        "Cosmos2_5_PredictBasePipeline",
         "Cosmos2_5_PredictImage2World",
         "Cosmos2_5_PredictText2World",
         "Cosmos2_5_PredictVideo2World",
@@ -626,7 +626,7 @@
             StableDiffusionXLControlNetXSPipeline,
         )
         from .cosmos import (
-            Cosmos2_5_PredictBase,
+            Cosmos2_5_PredictBasePipeline,
             Cosmos2_5_PredictImage2World,
             Cosmos2_5_PredictText2World,
             Cosmos2_5_PredictVideo2World,
diff --git a/src/diffusers/pipelines/cosmos/__init__.py b/src/diffusers/pipelines/cosmos/__init__.py
@@ -23,7 +23,7 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_cosmos2_5_predict"] = [
-        "Cosmos2_5_PredictBase",
+        "Cosmos2_5_PredictBasePipeline",
         "Cosmos2_5_PredictImage2World",
         "Cosmos2_5_PredictText2World",
         "Cosmos2_5_PredictVideo2World",
@@ -42,7 +42,7 @@
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_cosmos2_5_predict import (
-            Cosmos2_5_PredictBase,
+            Cosmos2_5_PredictBasePipeline,
             Cosmos2_5_PredictImage2World,
             Cosmos2_5_PredictText2World,
             Cosmos2_5_PredictVideo2World,
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -71,11 +71,11 @@ def retrieve_latents(
     Examples:
         ```python
         >>> import torch
-        >>> from diffusers import Cosmos2_5_PredictBase
+        >>> from diffusers import Cosmos2_5_PredictBasePipeline
         >>> from diffusers.utils import export_to_video, load_image, load_video
 
         >>> model_id = "nvidia/Cosmos-Predict2.5-Base-2B"
-        >>> pipe = Cosmos2_5_PredictBase.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
         >>> pipe = pipe.to("cuda")
 
         >>> # Common negative prompt reused across modes.
@@ -163,7 +163,7 @@ def retrieve_latents(
 """
 
 
-class Cosmos2_5_PredictBase(DiffusionPipeline):
+class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
     r"""
     Pipeline for [Cosmos Predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5) base model.
 
@@ -233,20 +233,6 @@ def __init__(
         if self.latents_mean is None or self.latents_std is None:
             raise ValueError("VAE configuration must define both `latents_mean` and `latents_std`.")
 
-    
-    @property
-    def _execution_device(self):
-        device = super()._execution_device
-        if isinstance(device, torch.device) and device.type == "cpu":
-            for module_name in ("transformer", "text_encoder", "vae"):
-                module = getattr(self, module_name, None)
-                if module is None or not isinstance(module, torch.nn.Module):
-                    continue
-                module_device = getattr(module, "device", None)
-                if isinstance(module_device, torch.device) and module_device.type != "cpu":
-                    return module_device
-        return device
-
     # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline._get_prompt_embeds
     def _get_prompt_embeds(
         self,
@@ -398,6 +384,8 @@ def encode_prompt(
 
         return prompt_embeds, negative_prompt_embeds
 
+    # Modified from diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2VideoToWorldPipeline.prepare_latents and
+    # diffusers.pipelines.cosmos.pipeline_cosmos2_text2image.Cosmos2TextToImagePipeline.prepare_latents
     def prepare_latents(
         self,
         video: Optional[torch.Tensor],
@@ -458,8 +446,6 @@ def prepare_latents(
 
             cond_latents = torch.cat(cond_latents, dim=0).to(dtype)
 
-            if self.latents_mean is None or self.latents_std is None:
-                raise ValueError("VAE configuration must define `latents_mean` and `latents_std`.")
             latents_mean = self.latents_mean.to(device=device, dtype=dtype)
             latents_std = self.latents_std.to(device=device, dtype=dtype)
             cond_latents = (cond_latents - latents_mean) / latents_std
diff --git a/tests/pipelines/cosmos/test_cosmos2_5_predict.py b/tests/pipelines/cosmos/test_cosmos2_5_predict.py
@@ -22,7 +22,12 @@
 import torch
 from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration
 
-from diffusers import AutoencoderKLWan, Cosmos2_5_PredictBase, CosmosTransformer3DModel, UniPCMultistepScheduler
+from diffusers import (
+    AutoencoderKLWan,
+    Cosmos2_5_PredictBasePipeline,
+    CosmosTransformer3DModel,
+    UniPCMultistepScheduler,
+)
 
 from ...testing_utils import enable_full_determinism, torch_device
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -33,7 +38,7 @@
 enable_full_determinism()
 
 
-class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBase):
+class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBasePipeline):
     @staticmethod
     def from_pretrained(*args, **kwargs):
         if "safety_checker" not in kwargs or kwargs["safety_checker"] is None:
@@ -42,7 +47,7 @@ def from_pretrained(*args, **kwargs):
             if isinstance(torch_dtype, torch.dtype):
                 safety_checker = safety_checker.to(dtype=torch_dtype)
             kwargs["safety_checker"] = safety_checker
-        return Cosmos2_5_PredictBase.from_pretrained(*args, **kwargs)
+        return Cosmos2_5_PredictBasePipeline.from_pretrained(*args, **kwargs)
 
 
 class Cosmos2_5_PredictPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@`
`63`	`63`	`FlowMatchEulerDiscreteScheduler,`
`64`	`64`	`UniPCMultistepScheduler,`
`65`	`65`	`)`
`66`		`-from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBase`
	`66`	`+from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline`
`67`	`67`
`68`	`68`
`69`	`69`	`def remove_keys_(key: str, state_dict: Dict[str, Any]):`
`@@ -545,7 +545,7 @@ def save_pipeline_cosmos2_5(args, transformer, vae):`
`545`	`545`	`sigma_min=0.01,`
`546`	`546`	`)`
`547`	`547`
`548`		`- pipe = Cosmos2_5_PredictBase(`
	`548`	`+ pipe = Cosmos2_5_PredictBasePipeline(`
`549`	`549`	`text_encoder=text_encoder,`
`550`	`550`	`tokenizer=tokenizer,`
`551`	`551`	`transformer=transformer,`
Original file line number	Diff line number	Diff line change
`@@ -488,8 +488,7 @@ def __init__(`
`488`	`488`	`hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False`
`489`	`489`	`)`
`490`	`490`
`491`		`- self.use_crossattn_projection = use_crossattn_projection`
`492`		`- if self.use_crossattn_projection:`
	`491`	`+ if self.config.use_crossattn_projection:`
`493`	`492`	`self.crossattn_proj = nn.Sequential(`
`494`	`493`	`nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),`
`495`	`494`	`nn.GELU(),`