Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fast latent image preview #454

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ test/
*.gguf
output*.png
models*
*.log
*.log
latent-preview.png
53 changes: 50 additions & 3 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "flux.hpp"
#include "stable-diffusion.h"

#include "latent-preview.h"

#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
Expand Down Expand Up @@ -765,6 +767,48 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
fflush(out_stream);
}

/**
 * Per-step sampling callback that writes a quick RGB preview of the current
 * latents to "latent-preview.png".
 *
 * The latent tensor is projected to RGB with a fixed linear projection table
 * selected from the latent channel count (latents->ne[2]) and the model
 * version. Unsupported channel counts or model versions are silently ignored,
 * as the preview is best-effort and must never interrupt sampling.
 *
 * @param step     Index of the sampling step that just finished (not used here).
 * @param latents  Latent tensor for this step; ne[0]/ne[1]/ne[2] are read as
 *                 width/height/channel count.
 * @param version  Model version, used to pick the projection table.
 *
 * NOTE(review): this function is passed to txt2img/img2img/img2vid through a
 * cast to step_callback_t, whose third parameter is declared as int rather
 * than enum SDVersion - confirm the two signatures are meant to differ.
 */
void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) {
    const int channel = 3;  // RGB output, one byte per component

    (void)step;

    if (latents == NULL) {
        return;
    }

    int width  = latents->ne[0];
    int height = latents->ne[1];
    int dim    = latents->ne[2];

    if (width <= 0 || height <= 0) {
        // nothing to draw
        return;
    }

    const float(*latent_rgb_proj)[channel] = NULL;

    if (dim == 16) {
        // 16 channels VAE -> Flux or SD3
        if (sd_version_is_sd3(version)) {
            latent_rgb_proj = sd3_latent_rgb_proj;
        } else if (sd_version_is_flux(version)) {
            latent_rgb_proj = flux_latent_rgb_proj;
        } else {
            // unknown model
            return;
        }
    } else if (dim == 4) {
        // 4 channels VAE
        if (version == VERSION_SDXL) {
            latent_rgb_proj = sdxl_latent_rgb_proj;
        } else if (version == VERSION_SD1 || version == VERSION_SD2) {
            latent_rgb_proj = sd_latent_rgb_proj;
        } else {
            // unknown model
            return;
        }
    } else {
        // unknown latent space
        return;
    }

    // Compute the size in size_t so the product cannot overflow int.
    size_t data_size = (size_t)width * (size_t)height * (size_t)channel * sizeof(uint8_t);
    uint8_t* data    = (uint8_t*)malloc(data_size);
    if (data == NULL) {
        // out of memory: skip this preview instead of writing through NULL
        return;
    }

    preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);

    // stride 0 lets stb derive the row stride from width * channel
    stbi_write_png("latent-preview.png", width, height, channel, data, 0);
    free(data);
}

int main(int argc, const char* argv[]) {
SDParams params;

Expand Down Expand Up @@ -930,7 +974,8 @@ int main(int argc, const char* argv[]) {
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.skip_layer_end,
(step_callback_t)step_callback);
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
Expand All @@ -951,7 +996,8 @@ int main(int argc, const char* argv[]) {
params.sample_method,
params.sample_steps,
params.strength,
params.seed);
params.seed,
(step_callback_t)step_callback);
if (results == NULL) {
printf("generate failed\n");
free_sd_ctx(sd_ctx);
Expand Down Expand Up @@ -997,7 +1043,8 @@ int main(int argc, const char* argv[]) {
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
params.skip_layer_end,
(step_callback_t)step_callback);
}
}

Expand Down
83 changes: 83 additions & 0 deletions latent-preview.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@

// Linear latent -> RGB projection used for quick previews of Flux latents.
// One row per latent channel (16), one column per output component (R, G, B):
// preview_latent_image() accumulates value * row[0..2] over all channels.
// Coefficients taken from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
const float flux_latent_rgb_proj[16][3] = {
{-0.0346, 0.0244, 0.0681},
{0.0034, 0.0210, 0.0687},
{0.0275, -0.0668, -0.0433},
{-0.0174, 0.0160, 0.0617},
{0.0859, 0.0721, 0.0329},
{0.0004, 0.0383, 0.0115},
{0.0405, 0.0861, 0.0915},
{-0.0236, -0.0185, -0.0259},
{-0.0245, 0.0250, 0.1180},
{0.1008, 0.0755, -0.0421},
{-0.0515, 0.0201, 0.0011},
{0.0428, -0.0012, -0.0036},
{0.0817, 0.0765, 0.0749},
{-0.1264, -0.0522, -0.1103},
{-0.0280, -0.0881, -0.0499},
{-0.1262, -0.0982, -0.0778}};

// Linear latent -> RGB projection used for quick previews of SD3 latents.
// One row per latent channel (16), one column per output component (R, G, B):
// preview_latent_image() accumulates value * row[0..2] over all channels.
// Coefficients taken from:
// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
const float sd3_latent_rgb_proj[16][3] = {
{-0.0645, 0.0177, 0.1052},
{0.0028, 0.0312, 0.0650},
{0.1848, 0.0762, 0.0360},
{0.0944, 0.0360, 0.0889},
{0.0897, 0.0506, -0.0364},
{-0.0020, 0.1203, 0.0284},
{0.0855, 0.0118, 0.0283},
{-0.0539, 0.0658, 0.1047},
{-0.0057, 0.0116, 0.0700},
{-0.0412, 0.0281, -0.0039},
{0.1106, 0.1171, 0.1220},
{-0.0248, 0.0682, -0.0481},
{0.0815, 0.0846, 0.1207},
{-0.0120, -0.0055, -0.0867},
{-0.0749, -0.0634, -0.0456},
{-0.1418, -0.1457, -0.1259},
};

// Linear latent -> RGB projection used for quick previews of SDXL latents.
// One row per latent channel (4), one column per output component (R, G, B):
// preview_latent_image() accumulates value * row[0..2] over all channels.
// Coefficients taken from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
const float sdxl_latent_rgb_proj[4][3] = {
{0.3651, 0.4232, 0.4341},
{-0.2533, -0.0042, 0.1068},
{0.1076, 0.1111, -0.0362},
{-0.3165, -0.2492, -0.2188}};

// Linear latent -> RGB projection used for quick previews of SD1.x / SD2.x
// latents (selected for VERSION_SD1 and VERSION_SD2).
// One row per latent channel (4), one column per output component (R, G, B):
// preview_latent_image() accumulates value * row[0..2] over all channels.
// Coefficients taken from:
// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
// NOTE(review): the line anchor above is identical to the one on the SDXL
// table - confirm it points at the SD1.x factors.
const float sd_latent_rgb_proj[4][3] = {
    {0.3512, 0.2297, 0.3227},
    {0.3250, 0.4974, 0.2350},
    {-0.2829, 0.1762, 0.2721},
    {-0.2120, -0.2616, -0.7177}};

/**
 * Project a latent tensor to a packed 8-bit RGB image using a per-channel
 * linear projection.
 *
 * For every pixel (i, j) the function sums value(i, j, d) * latent_rgb_proj[d]
 * over d in [0, dim), maps the result through x -> 0.5 * x + 0.5 (which sends
 * [-1, 1] to [0, 1]), clamps to [0, 1] and stores it as three bytes (R, G, B).
 *
 * Declared static inline because this definition lives in a header: with
 * external linkage, every additional translation unit including the header
 * would produce a duplicate-symbol link error.
 *
 * @param buffer           Output, written row by row; must hold at least
 *                         width * height * 3 bytes.
 * @param latents          Source tensor. Elements are read as float, and
 *                         nb[0], nb[1], nb[2] are used as byte strides for the
 *                         x, y and channel axes.
 * @param latent_rgb_proj  Projection table with at least `dim` rows of 3 floats.
 * @param width            Number of pixels per row.
 * @param height           Number of rows.
 * @param dim              Number of latent channels to accumulate.
 */
static inline void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
    // Best-effort helper: silently do nothing on missing inputs.
    if (!buffer || !latents || !latents->data || !latent_rgb_proj) {
        return;
    }

    size_t buffer_head = 0;
    for (int j = 0; j < height; j++) {
        for (int i = 0; i < width; i++) {
            // byte offset of element (i, j, channel 0) inside latents->data
            size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]);
            float r = 0, g = 0, b = 0;
            for (int d = 0; d < dim; d++) {
                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
                r += value * latent_rgb_proj[d][0];
                g += value * latent_rgb_proj[d][1];
                b += value * latent_rgb_proj[d][2];
            }

            // change range
            r = r * .5f + .5f;
            g = g * .5f + .5f;
            b = b * .5f + .5f;

            // clamp rgb values to [0,1] range (NaN fails the first test and maps to 0)
            r = r >= 0 ? r <= 1 ? r : 1 : 0;
            g = g >= 0 ? g <= 1 ? g : 1 : 0;
            b = b >= 0 ? b <= 1 ? b : 1 : 0;

            buffer[buffer_head++] = (uint8_t)(r * 255);
            buffer[buffer_head++] = (uint8_t)(g * 255);
            buffer[buffer_head++] = (uint8_t)(b * 255);
        }
    }
}
44 changes: 29 additions & 15 deletions stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,10 +781,11 @@ class StableDiffusionGGML {
const std::vector<float>& sigmas,
int start_merge_step,
SDCondition id_cond,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
size_t steps = sigmas.size() - 1;
// noise = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(noise);
Expand Down Expand Up @@ -943,6 +944,9 @@ class StableDiffusionGGML {
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
if (step_callback != nullptr) {
step_callback(step, denoised, version);
}
return denoised;
};

Expand Down Expand Up @@ -1163,10 +1167,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
float style_ratio,
bool normalize_input,
std::string input_id_images_path,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
Expand Down Expand Up @@ -1388,7 +1393,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end);
skip_layer_end,
step_callback);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
Expand Down Expand Up @@ -1459,7 +1465,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
float skip_layer_end = 0.2,
step_callback_t step_callback = NULL) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
Expand Down Expand Up @@ -1532,7 +1539,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
skip_layer_end,
step_callback);

size_t t1 = ggml_time_ms();

Expand Down Expand Up @@ -1564,7 +1572,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
float skip_layer_end = 0.2,
step_callback_t step_callback = NULL) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
Expand Down Expand Up @@ -1643,7 +1652,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
skip_layer_end,
step_callback);

size_t t2 = ggml_time_ms();

Expand All @@ -1665,7 +1675,8 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed) {
int64_t seed,
step_callback_t step_callback = NULL) {
if (sd_ctx == NULL) {
return NULL;
}
Expand Down Expand Up @@ -1744,7 +1755,10 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sample_method,
sigmas,
-1,
SDCondition(NULL, NULL, NULL));
SDCondition(NULL, NULL, NULL),
{},
0, 0, 0,
step_callback);

int64_t t2 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
Expand Down
11 changes: 8 additions & 3 deletions stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,

SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);

/**
 * Optional callback invoked by the sampler after each sampling step.
 *
 * @param step     Index of the step that just completed.
 * @param latents  Denoised latent tensor produced by that step.
 * @param version  Model version of the running context, passed as a plain int.
 *
 * Pass NULL to txt2img / img2img / img2vid to disable the callback.
 *
 * NOTE(review): examples/cli/main.cpp defines its callback with an
 * `enum SDVersion` third parameter and casts it to this type - confirm the
 * int / enum mismatch is intended.
 */
typedef void (*step_callback_t)(int step, struct ggml_tensor* latents, int version);

SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
Expand All @@ -170,7 +172,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
float skip_layer_end,
step_callback_t step_callback);

SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
Expand All @@ -195,7 +198,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
float skip_layer_end);
float skip_layer_end,
step_callback_t step_callback);

SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
Expand All @@ -210,7 +214,8 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed);
int64_t seed,
step_callback_t step_callback);

typedef struct upscaler_ctx_t upscaler_ctx_t;

Expand Down
Loading