Mingyuanm/add back fp8 support to sd #9070

Merged: 57 commits (May 1, 2024)
Commits
e91a66d
update branch
ericharper Jan 29, 2024
305ad9c
Add dist ckpt support for regular optimizers (#7749)
mikolajblaz Jan 31, 2024
40da002
Pin lhotse=1.19.2 in r1.23.0 (#8303)
pzelasko Feb 1, 2024
d3bad4b
Cache Aware Streaming tutorial notebook (#8296)
erastorgueva-nv Feb 1, 2024
17f09e4
fix path location and branch (#8304)
nithinraok Feb 2, 2024
991dad9
add deallocate pipeline output optimization (#8279)
JimmyZhang12 Feb 2, 2024
e9320ed
Fix memory leak caused by context parallelism hanging references by o…
JimmyZhang12 Feb 2, 2024
8b18cfc
remove assertion (#8302)
dimapihtar Feb 2, 2024
d9f1409
Update PEFT Doc (#8262)
cuichenx Feb 3, 2024
a592517
Attention encoder-decoder models for multiple speech-to-text tasks …
titu1994 Feb 3, 2024
c3c766e
Multimodal r1.23.0 bug fix (#8315)
yaoyu-33 Feb 6, 2024
1434979
Fixes for MoE parameter passing & use of AutoTokenizer/Model for mist…
akoumpa Feb 6, 2024
ec8f413
Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP…
erhoo82 Feb 6, 2024
50864db
Remove asr webapp (#8347)
titu1994 Feb 6, 2024
498e9e4
remove _target_ at model level in aed config (#8351)
krishnacpuvvada Feb 6, 2024
2f72846
Add change_vocabulary and save_tokenizers() support to Multitask ASR …
titu1994 Feb 7, 2024
931c53c
Change default (#8371)
titu1994 Feb 8, 2024
0e13348
bug fix in fast-conformer-aed.yaml and adding jenkins test for speech…
krishnacpuvvada Feb 9, 2024
138a7ab
Enable megatron core loggers for GPT pretraining (#8354)
ashbhandare Feb 9, 2024
4ee9c58
mcore ds fix (#8283)
dimapihtar Feb 9, 2024
02ec761
Add Finetuning tutorial with HF Datasets (#8356)
nithinraok Feb 9, 2024
88d7b21
release updates (#8378)
dimapihtar Feb 9, 2024
400c4a1
MCore dataset compatibility for tokenizers (#8390)
vysarge Feb 11, 2024
3112091
Mcore customization doc (#8298)
HuiyingLi Feb 12, 2024
68eba36
wer fix (#8404)
tbartley94 Feb 12, 2024
5b8f18c
updated link to pubmed (#8402)
nithinraok Feb 13, 2024
0f7b49b
Update NFA video download link (#8406)
erastorgueva-nv Feb 13, 2024
f897a77
revert changes (#8410)
cuichenx Feb 13, 2024
371de5b
Fix dreambooth data sampler issue (#8400)
yaoyu-33 Feb 13, 2024
98186c2
Fixed errors in the CTM gen functions (#8416)
tango4j Feb 14, 2024
8689bc0
add ensemble decoding fix (#8427)
nithinraok Feb 15, 2024
770f73b
SDE bugfix log (#8430)
Jorjeous Feb 15, 2024
05122bd
mcore customization doc minor fix (#8421)
HuiyingLi Feb 16, 2024
2e77f20
NeMo-Mistral to HF converter bugfix. (#8353)
akoumpa Feb 16, 2024
9588494
Fixing mcore bert for TP, PP and SP (#8336)
shanmugamr1992 Feb 16, 2024
71ce00c
Add settings to suppress bf16 compile errors in CI on V100 (#8481)
athitten Feb 22, 2024
c98b9c1
MoE parameter passing (#8255)
akoumpa Feb 23, 2024
a836fce
Update k2 version (#8478) (#8492)
artbataev Feb 23, 2024
0dc8a19
Add fp8 support for SD/Update notebook paths (#8489)
Victor49152 Feb 25, 2024
1d80d00
pin to 0.5.0 (#8465)
ericharper Feb 26, 2024
fcf1044
Update NeMo Multimodal Requirements (#8515)
yaoyu-33 Feb 26, 2024
d2283e3
update github raw content link (#8517)
cuichenx Feb 26, 2024
e6b7354
Add dep notice for notebooks (#8522)
ericharper Feb 27, 2024
ae9a2aa
Revert FP8 integration (#8520)
Victor49152 Feb 27, 2024
e772dbf
Update data prep notebook (#8532)
Victor49152 Feb 27, 2024
e65d3de
Add back fp8 support
Victor49152 Feb 29, 2024
82911a4
SD-FP8: fix the bug of normalization location
Mar 6, 2024
e7b29ae
map potential FP8 ckpt to FP16
Victor49152 Apr 25, 2024
11bfe29
Merge branch 'main' into mingyuanm/add_back_fp8_support_to_sd
Victor49152 Apr 26, 2024
712fc8a
Add TE fp8 training
Victor49152 Apr 29, 2024
d5c5686
Only overwrite unet precision when self.megatron_amp_O2 is true
Victor49152 Apr 29, 2024
cab6816
New structure is now compatible with old ckpts
Victor49152 Apr 29, 2024
f70a50f
Add support on mapping old unet checkpoint to new structure and FP8 s…
Victor49152 Apr 30, 2024
14135da
Merge branch 'main' into mingyuanm/add_back_fp8_support_to_sd
Victor49152 Apr 30, 2024
9bbcbb5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 30, 2024
b5e2ef2
Sync with main branch
Victor49152 Apr 30, 2024
601b4b1
Merge remote-tracking branch 'origin/mingyuanm/add_back_fp8_support_t…
Victor49152 Apr 30, 2024
Changes from all commits
@@ -49,8 +49,8 @@ model:
precision: ${trainer.precision}
# specify micro_batch_size, global_batch_size, and model parallelism
# gradient accumulation will be done automatically based on data_parallel_size
- micro_batch_size: 1 # limited by GPU memory
- global_batch_size: 1 # will use more micro batches to reach global batch size
+ micro_batch_size: 16 # limited by GPU memory
+ global_batch_size: 16 # will use more micro batches to reach global batch size
native_amp_init_scale: 65536.0 # Init scale for grad scaler used at fp16


@@ -97,15 +97,15 @@ model:
unet_config:
_target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel
from_pretrained: #/ckpts/nemo-v1-2.ckpt
- from_NeMo: True #Must be specified when from pretrained is not None, False means loading unet from HF ckpt
+ from_NeMo: False #Must be specified when from pretrained is not None, False means loading unet from HF ckpt
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions:
- 4
- 2
- 1
num_res_blocks: 2
channel_mult:
- 1
@@ -121,6 +121,7 @@ model:
use_flash_attention: True
unet_precision: fp32
resblock_gn_groups: 32
+ use_te_fp8: False

first_stage_config:
_target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
@@ -140,30 +141,30 @@ model:
- 4
- 4
num_res_blocks: 2
- attn_resolutions: []
+ attn_resolutions: [ ]
dropout: 0.0
lossconfig:
target: torch.nn.Identity

cond_stage_config:
- _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder
- restore_from_path: /ckpts/openai.nemo
+ _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
+ version: openai/clip-vit-large-patch14
device: cuda
freeze: True
layer: "last"
- # For compatibility of history version that uses HF clip model
- # _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder
- # version: openai/clip-vit-large-patch14
- # device: cuda
- # max_length: 77
+ max_length: 77
+ # _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder
+ # restore_from_path: /ckpts/openai-old.nemo
+ # device: cuda
+ # freeze: True
+ # layer: "last"



# miscellaneous
seed: 1234
resume_from_checkpoint: null # manually set the checkpoint file to load from
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
- ddp_overlap: True # True for using PyTorch DDP overlap.
+ ddp_overlap: False # True for using PyTorch DDP overlap.

optim:
name: fused_adam
@@ -191,7 +192,7 @@ model:
synthetic_data_length: 10000
train:
dataset_path:
- - /datasets/coyo/test.pkl
+ - /datasets/coyo/wdinfo/coyo-700m/wdinfo-selene.pkl
augmentations:
resize_smallest_side: 512
center_crop_h_w: 512, 512
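Of the config changes above, use_te_fp8 is the new switch and it defaults to False, so existing configs keep the non-Transformer-Engine path. A minimal sketch of flipping it programmatically (the YAML file name is a placeholder; the keys mirror the hunks above):

from omegaconf import OmegaConf

cfg = OmegaConf.load("sd_train.yaml")    # placeholder name for the SD training config diffed above
cfg.model.unet_config.use_te_fp8 = True  # route the UNet's attention/FFN projections through TE FP8 layers
print(OmegaConf.to_yaml(cfg.model.unet_config))

If the training entry point is Hydra-based (as NeMo scripts typically are), the same override can be passed on the command line as model.unet_config.use_te_fp8=True.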
@@ -28,6 +28,9 @@ def model_cfg_modifier(model_cfg):
model_cfg.unet_config.use_flash_attention = False
model_cfg.unet_config.from_pretrained = None
model_cfg.first_stage_config.from_pretrained = None
+ model_cfg.first_stage_config._target_ = (
+     'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL'
+ )

torch.backends.cuda.matmul.allow_tf32 = True
trainer, megatron_diffusion_model = setup_trainer_and_model_for_inference(
@@ -1674,7 +1674,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
# megatron_amp_O2 is not yet supported in diffusion models
self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False)

- if self.cfg.precision in ['16', 16, 'bf16']:
+ if self.megatron_amp_O2 and self.cfg.precision in ['16', 16, 'bf16']:
self.model_parallel_config.enable_autocast = False
if not hasattr(self.cfg.unet_config, 'unet_precision') or not '16' in str(
self.cfg.unet_config.unet_precision
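The guard above means the UNet precision handling only kicks in when megatron_amp_O2 is enabled; the FP8 path itself comes from Transformer Engine, whose modules execute FP8 GEMMs only inside an fp8_autocast region. A rough, standalone illustration of that mechanism (not NeMo code; the te.Linear stand-in and recipe settings are assumptions, and FP8 execution requires an FP8-capable GPU such as Hopper):

import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

# Stand-in layer; in this PR the TE modules are LayerNormLinear/LayerNormMLP inside the UNet blocks.
layer = te.Linear(320, 320, bias=False).cuda()
fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.HYBRID)

x = torch.randn(16, 77, 320, device="cuda")
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    y = layer(x)  # the GEMM runs in FP8; module parameters stay in higher precision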
66 changes: 53 additions & 13 deletions nemo/collections/multimodal/modules/stable_diffusion/attention.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
+ import os
from inspect import isfunction

import torch
@@ -21,6 +22,13 @@
from torch import einsum, nn
from torch._dynamo import disable

+ if os.environ.get("USE_NATIVE_GROUP_NORM", "0") == "1":
+     from nemo.gn_native import GroupNormNormlization as GroupNorm
+ else:
+     from apex.contrib.group_norm import GroupNorm
+
+ from transformer_engine.pytorch.module import LayerNormLinear, LayerNormMLP

from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.util import checkpoint
from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import (
AdapterName,
@@ -96,13 +104,19 @@ def forward(self, x):


class FeedForward(nn.Module):
- def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, use_te=False):
super().__init__()
inner_dim = int(dim * mult)
dim_out = default(dim_out, dim)
- project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim)
- self.net = nn.Sequential(project_in, nn.Dropout(dropout), LinearWrapper(inner_dim, dim_out))
+ if use_te:
+     activation = 'gelu' if not glu else 'geglu'
+     # TODO: more parameters to be confirmed, dropout, seq_length
+     self.net = LayerNormMLP(hidden_size=dim, ffn_hidden_size=inner_dim, activation=activation,)
+ else:
+     norm = nn.LayerNorm(dim)
+     project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim)
+     self.net = nn.Sequential(norm, project_in, nn.Dropout(dropout), LinearWrapper(inner_dim, dim_out))

def forward(self, x):
return self.net(x)
@@ -225,10 +239,15 @@ def __init__(
dropout=0.0,
use_flash_attention=False,
lora_network_alpha=None,
+ use_te=False,
):
super().__init__()

self.inner_dim = dim_head * heads
+ if context_dim is None:
+     self.is_self_attn = True
+ else:
+     self.is_self_attn = False  # cross-attention
context_dim = default(context_dim, query_dim)
# make attention part be aware of self-attention/cross-attention
self.context_dim = context_dim
@@ -238,10 +257,19 @@ def __init__(
self.scale = dim_head ** -0.5
self.heads = heads

- self.to_q = LinearWrapper(query_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha)
self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha)
self.to_v = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha)

+ self.use_te = use_te
+ if use_te:
+     return_layernorm_output = True if self.is_self_attn else False
+     self.norm_to_q = LayerNormLinear(
+         query_dim, self.inner_dim, bias=False, return_layernorm_output=return_layernorm_output
+     )
+ else:
+     self.norm = nn.LayerNorm(query_dim)
+     self.to_q = LinearWrapper(query_dim, self.inner_dim, bias=False)

self.to_out = nn.Sequential(
LinearWrapper(self.inner_dim, query_dim, lora_network_alpha=lora_network_alpha), nn.Dropout(dropout)
)
@@ -262,8 +290,18 @@ def forward(self, x, context=None, mask=None, additional_tokens=None, n_times_cr
# add additional token
x = torch.cat([additional_tokens, x], dim=1)

- q = self.to_q(x)
- context = default(context, x)
+ if self.use_te:
+     q_out = self.norm_to_q(x)
+     if self.is_self_attn:
+         q, ln_out = q_out
+         context = default(context, ln_out)
+     else:
+         q = q_out
+         context = default(context, x)
+ else:
+     x = self.norm(x)
+     q = self.to_q(x)
+     context = default(context, x)
k = self.to_k(context)
v = self.to_v(context)

@@ -351,6 +389,7 @@ def __init__(
use_flash_attention=False,
disable_self_attn=False,
lora_network_alpha=None,
+ use_te=False,
):
super().__init__()
self.disable_self_attn = disable_self_attn
@@ -362,8 +401,9 @@ def __init__(
use_flash_attention=use_flash_attention,
context_dim=context_dim if self.disable_self_attn else None,
lora_network_alpha=lora_network_alpha,
+ use_te=use_te,
) # is a self-attention
- self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff, use_te=use_te)
self.attn2 = CrossAttention(
query_dim=dim,
context_dim=context_dim,
@@ -372,10 +412,8 @@ def __init__(
dropout=dropout,
use_flash_attention=use_flash_attention,
lora_network_alpha=lora_network_alpha,
+ use_te=use_te,
) # is self-attn if context is none
- self.norm1 = nn.LayerNorm(dim)
- self.norm2 = nn.LayerNorm(dim)
- self.norm3 = nn.LayerNorm(dim)
self.use_checkpoint = use_checkpoint

def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0):
Expand All @@ -397,15 +435,15 @@ def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_at
def _forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0):
x = (
self.attn1(
- self.norm1(x),
+ x,
context=context if self.disable_self_attn else None,
additional_tokens=additional_tokens,
n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self if not self.disable_self_attn else 0,
)
+ x
)
- x = self.attn2(self.norm2(x), context=context, additional_tokens=additional_tokens) + x
- x = self.ff(self.norm3(x)) + x
+ x = self.attn2(x, context=context, additional_tokens=additional_tokens) + x
+ x = self.ff(x) + x
return x


@@ -431,6 +469,7 @@ def __init__(
use_checkpoint=False,
use_flash_attention=False,
lora_network_alpha=None,
+ use_te=False,
):
super().__init__()
logging.info(
@@ -473,6 +512,7 @@ def __init__(
use_flash_attention=use_flash_attention,
disable_self_attn=disable_self_attn,
lora_network_alpha=lora_network_alpha,
+ use_te=use_te,
)
for d in range(depth)
]
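The CrossAttention changes above fold the pre-attention LayerNorm into the q projection (TE's LayerNormLinear with return_layernorm_output) and, in the self-attention case, reuse the returned normalized tensor for the k/v projections instead of normalizing twice; the separate norm1/norm2/norm3 layers in BasicTransformerBlock then become unnecessary. A pure-PyTorch sketch of that pattern (illustrative only, not the NeMo or Transformer Engine implementation; the names are made up):

import torch
from torch import nn

class NormLinear(nn.Module):
    # Toy stand-in for a fused "LayerNormLinear": optionally returns the normalized
    # input alongside the projection so callers can reuse it.
    def __init__(self, dim_in, dim_out, return_layernorm_output=False):
        super().__init__()
        self.norm = nn.LayerNorm(dim_in)
        self.proj = nn.Linear(dim_in, dim_out, bias=False)
        self.return_layernorm_output = return_layernorm_output

    def forward(self, x):
        ln = self.norm(x)
        out = self.proj(ln)
        return (out, ln) if self.return_layernorm_output else out

# Self-attention: q comes from the fused block, and the same normalized tensor
# feeds the k/v projections, so no standalone nn.LayerNorm is needed.
x = torch.randn(2, 77, 320)
norm_to_q = NormLinear(320, 320, return_layernorm_output=True)
to_k = nn.Linear(320, 320, bias=False)
to_v = nn.Linear(320, 320, bias=False)
q, ln_out = norm_to_q(x)
k, v = to_k(ln_out), to_v(ln_out)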