add stable unclip
rromb committed Mar 24, 2023
1 parent d55bcd4 commit b4bdae9
Showing 42 changed files with 4,746 additions and 21 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -8,6 +8,14 @@ new checkpoints. The following list provides an overview of all currently available

## News

**March 24, 2023**

*Stable UnCLIP 2.1*
- New stable diffusion finetune (_Stable unCLIP 2.1_, [HuggingFace](https://huggingface.co/stabilityai/)) at 768x768 resolution,
based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models
such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP
ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).

**December 7, 2022**

*Version 2.1*
@@ -137,6 +145,9 @@ Note: The inference config for all model versions is designed to be used with EMA-only checkpoints.
For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from
non-EMA to EMA weights.

### Stable unCLIP
See [doc/UNCLIP.MD](doc/UNCLIP.MD).

### Image Modification with Stable Diffusion

![depth2img-stable2](assets/stable-samples/depth2img/merged-0000.png)
Binary file added assets/stable-samples/stable-unclip/panda.jpg
1 change: 1 addition & 0 deletions checkpoints/checkpoints.txt
@@ -0,0 +1 @@
Put unCLIP checkpoints here.
37 changes: 37 additions & 0 deletions configs/karlo/decoder_900M_vit_l.yaml
@@ -0,0 +1,37 @@
model:
  type: t2i-decoder
  diffusion_sampler: uniform
  hparams:
    image_size: 64
    num_channels: 320
    num_res_blocks: 3
    channel_mult: ''
    attention_resolutions: 32,16,8
    num_heads: -1
    num_head_channels: 64
    num_heads_upsample: -1
    use_scale_shift_norm: true
    dropout: 0.1
    clip_dim: 768
    clip_emb_mult: 4
    text_ctx: 77
    xf_width: 1536
    xf_layers: 0
    xf_heads: 0
    xf_final_ln: false
    resblock_updown: true
    learn_sigma: true
    text_drop: 0.3
    clip_emb_type: image
    clip_emb_drop: 0.1
    use_plm: true

diffusion:
  steps: 1000
  learn_sigma: true
  sigma_small: false
  noise_schedule: squaredcos_cap_v2
  use_kl: false
  predict_xstart: false
  rescale_learned_sigmas: true
  timestep_respacing: ''
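
For orientation, this config is plain YAML and can be inspected directly; a minimal sketch (run from the repository root, PyYAML assumed available):

```python
import yaml

# Minimal sketch: read the Karlo decoder config and print a few hyperparameters.
with open("configs/karlo/decoder_900M_vit_l.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["type"])                 # t2i-decoder
print(cfg["model"]["hparams"]["clip_dim"])  # 768 (CLIP ViT-L/14 embedding size)
```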
27 changes: 27 additions & 0 deletions configs/karlo/improved_sr_64_256_1.4B.yaml
@@ -0,0 +1,27 @@
model:
  type: improved_sr_64_256
  diffusion_sampler: uniform
  hparams:
    channels: 320
    depth: 3
    channels_multiple:
      - 1
      - 2
      - 3
      - 4
    dropout: 0.0

diffusion:
  steps: 1000
  learn_sigma: false
  sigma_small: true
  noise_schedule: squaredcos_cap_v2
  use_kl: false
  predict_xstart: false
  rescale_learned_sigmas: true
  timestep_respacing: '7'


sampling:
  timestep_respacing: '7' # fix
  clip_denoise: true
21 changes: 21 additions & 0 deletions configs/karlo/prior_1B_vit_l.yaml
@@ -0,0 +1,21 @@
model:
  type: prior
  diffusion_sampler: uniform
  hparams:
    text_ctx: 77
    xf_width: 2048
    xf_layers: 20
    xf_heads: 32
    xf_final_ln: true
    text_drop: 0.2
    clip_dim: 768

diffusion:
  steps: 1000
  learn_sigma: false
  sigma_small: true
  noise_schedule: squaredcos_cap_v2
  use_kl: false
  predict_xstart: true
  rescale_learned_sigmas: false
  timestep_respacing: ''
80 changes: 80 additions & 0 deletions configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml
@@ -0,0 +1,80 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
  params:
    embedding_dropout: 0.25
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 96
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn-adm
    scale_factor: 0.18215
    monitor: val/loss_simple_ema
    use_ema: False

    embedder_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder

    noise_aug_config:
      target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
      params:
        timestep_dim: 1024
        noise_schedule_config:
          timesteps: 1000
          beta_schedule: squaredcos_cap_v2

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        num_classes: "sequential"
        adm_in_channels: 2048
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: [ ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
83 changes: 83 additions & 0 deletions configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml
@@ -0,0 +1,83 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
  params:
    embedding_dropout: 0.25
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 96
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn-adm
    scale_factor: 0.18215
    monitor: val/loss_simple_ema
    use_ema: False

    embedder_config:
      target: ldm.modules.encoders.modules.ClipImageEmbedder
      params:
        model: "ViT-L/14"

    noise_aug_config:
      target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
      params:
        clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
        timestep_dim: 768
        noise_schedule_config:
          timesteps: 1000
          beta_schedule: squaredcos_cap_v2

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        num_classes: "sequential"
        adm_in_channels: 1536
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: [ ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
56 changes: 56 additions & 0 deletions doc/UNCLIP.MD
@@ -0,0 +1,56 @@
### Stable unCLIP

[unCLIP](https://openai.com/dall-e-2/) is the approach behind OpenAI's [DALL·E 2](https://openai.com/dall-e-2/),
trained to invert CLIP image embeddings.
We finetuned SD 2.1 to accept a CLIP ViT-L/14 image embedding in addition to the text encodings.
This means that the model can be used to produce image variations, but can also be combined with a text-to-image
embedding prior to yield a full text-to-image model at 768x768 resolution.
We provide two models, trained on OpenAI CLIP-L and OpenCLIP-H image embeddings, respectively,
available from [https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/tree/main).
To use them, download the weights from Hugging Face and put them into the `checkpoints` folder.
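
If you want to load a checkpoint outside of the streamlit script below, a minimal sketch using the repo's usual helpers might look as follows (shown for the unCLIP-H variant; assumes the checkpoint sits in `checkpoints/` under the file name linked above, and is not the official loading code):

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

# Minimal sketch: build the unCLIP-H model from its inference config
# and load the downloaded weights.
config = OmegaConf.load("configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml")
model = instantiate_from_config(config.model)
state_dict = torch.load("checkpoints/sd21-unclip-h.ckpt", map_location="cpu")["state_dict"]
model.load_state_dict(state_dict, strict=False)
model = model.cuda().eval()
```
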
#### Image Variations
![image-variations-l-1](../assets/stable-samples/stable-unclip/unclip-variations.png)

Run

```shell
streamlit run scripts/streamlit/stableunclip.py
```
to launch a streamlit script that can be used to make image variations with both models (CLIP-L and OpenCLIP-H).
These models can process a `noise_level`, which specifies an amount of Gaussian noise added to the CLIP embeddings.
This can be used to increase output variance as in the following examples.

![image-variations-noise](../assets/stable-samples/stable-unclip/unclip-variations_noise.png)
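
Conceptually, the noise level pushes the CLIP embedding a number of steps along a diffusion schedule before it is fed to the model, so higher levels retain less of the source image's identity. A toy, self-contained sketch of that effect (this is not the repo's implementation, which uses `CLIPEmbeddingNoiseAugmentation` with a `squaredcos_cap_v2` schedule):

```python
import torch

# Toy sketch only: forward-diffuse a stand-in CLIP embedding for `level` steps
# and see how quickly it decorrelates from the original.
torch.manual_seed(0)
emb = torch.randn(1, 768)  # stand-in for a CLIP ViT-L/14 image embedding
alphas_cumprod = torch.cumprod(1.0 - torch.linspace(1e-4, 2e-2, 1000), dim=0)

def noised(x, level):
    a = alphas_cumprod[level]
    return a.sqrt() * x + (1.0 - a).sqrt() * torch.randn_like(x)

for level in (0, 100, 500, 900):
    print(level, torch.cosine_similarity(emb, noised(emb, level)).item())
```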


### Stable Diffusion Meets Karlo
![panda](../assets/stable-samples/stable-unclip/panda.jpg)

Recently, [KakaoBrain](https://kakaobrain.com/) openly released [Karlo](https://github.com/kakaobrain/karlo), a pretrained, large-scale replication of [unCLIP](https://arxiv.org/abs/2204.06125).
We introduce _Stable Karlo_, a combination of the Karlo CLIP image embedding prior and Stable Diffusion v2.1-768.

To run the model, first download the KARLO checkpoints
```shell
mkdir -p checkpoints/karlo_models
cd checkpoints/karlo_models
wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/096db1af569b284eb76b3881534822d9/ViT-L-14.pt
wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/0b62380a75e56f073e2844ab5199153d/ViT-L-14_stats.th
wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/85626483eaca9f581e2a78d31ff905ca/prior-ckpt-step%3D01000000-of-01000000.ckpt
cd ../../
```
and the finetuned SD2.1 unCLIP-L checkpoint from [here](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt), and put the ckpt into the `checkpoints` folder.

Then, run

```shell
streamlit run scripts/streamlit/stableunclip.py
```
and pick the `use_karlo` option in the GUI.
The script optionally supports sampling from the full Karlo model. To use it, download the 64x64 decoder and 64->256 upscaler
via
```shell
cd checkpoints/karlo_models
wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/efdf6206d8ed593961593dc029a8affa/decoder-ckpt-step%3D01000000-of-01000000.ckpt
wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/4226b831ae0279020d134281f3c31590/improved-sr-ckpt-step%3D1.2M.ckpt
cd ../../
```