feat: add sd3.5 medium and skip layer guidance support (#451)

* mmdit-x

* add support for sd3.5 medium

* add skip layer guidance support (mmdit only)

* ignore slg if slg_scale is zero (optimization)

* init out_skip once

* slg support for flux (experimental)

* warn if version doesn't support slg

* refactor slg cli args

* set default slg_scale to 0 (oops)

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
Author: stduhpf
Date: 2024-11-23 04:15:31 +01:00
Committed by: GitHub
Commit: 65fa646684 (parent ac54e00760)
9 changed files with 414 additions and 79 deletions
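Skip layer guidance (SLG) adds a third forward pass per sampling step in which selected transformer blocks are skipped, then pushes the guided result away from that degraded output. Per the bullets above it targets MMDiT models (SD3.x), with experimental support for Flux. A minimal sketch of the per-element combination used in the sampler below; the helper and its argument names are illustrative, not part of the API:

    // Hedged sketch: how CFG and SLG combine the three model outputs for one
    // latent element. cond = full conditional pass, uncond = unconditional pass,
    // skip = conditional pass with the listed layers skipped.
    static float guided(float cond, float uncond, float skip,
                        float cfg_scale, float slg_scale) {
        float r = uncond + cfg_scale * (cond - uncond); // classifier-free guidance
        r += slg_scale * (cond - skip);                 // skip layer guidance
        return r;
    }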


@@ -32,7 +32,8 @@ const char* model_version_to_str[] = {
"SD3 2B",
"Flux Dev",
"Flux Schnell",
"SD3.5 8B"};
"SD3.5 8B",
"SD3.5 2B"};
const char* sampling_methods_str[] = {
"Euler A",
@@ -288,7 +289,7 @@ public:
"try specifying SDXL VAE FP16 Fix with the --vae parameter. "
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
}
} else if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
} else if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
scale_factor = 1.5305f;
} else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
scale_factor = 0.3611;
@@ -311,7 +312,7 @@ public:
} else {
clip_backend = backend;
bool use_t5xxl = false;
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
use_t5xxl = true;
}
if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != GGML_TYPE_F32) {
@@ -322,7 +323,7 @@ public:
LOG_INFO("CLIP: Using CPU backend");
clip_backend = ggml_backend_cpu_init();
}
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype);
diffusion_model = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
} else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
@@ -520,7 +521,7 @@ public:
is_using_v_parameterization = true;
}
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
LOG_INFO("running in FLOW mode");
denoiser = std::make_shared<DiscreteFlowDenoiser>();
} else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
@@ -771,7 +772,11 @@ public:
sample_method_t method,
const std::vector<float>& sigmas,
int start_merge_step,
SDCondition id_cond) {
SDCondition id_cond,
std::vector<int> skip_layers = {},
float slg_scale = 2.5,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
size_t steps = sigmas.size() - 1;
// noise = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(noise);
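Note the internal defaults on the new parameters (slg_scale 2.5, window from 1% to 20% of the schedule); per the "set default slg_scale to 0" bullet above, the CLI-facing default keeps SLG disabled unless requested. A minimal sketch of the enabling condition, mirroring the has_skiplayer check in the next hunk (the helper name is illustrative):

    #include <vector>

    // SLG only costs an extra forward pass when a nonzero scale and a
    // non-empty list of layers to skip are both supplied.
    static bool slg_enabled(float slg_scale, const std::vector<int>& skip_layers) {
        return slg_scale != 0.0f && !skip_layers.empty();
    }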
@@ -782,13 +787,24 @@ public:
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
struct ggml_tensor* out_skip = NULL;
if (has_unconditioned) {
out_uncond = ggml_dup_tensor(work_ctx, x);
}
if (has_skiplayer) {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_2B || version == VERSION_SD3_5_8B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
out_skip = ggml_dup_tensor(work_ctx, x);
} else {
has_skiplayer = false;
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
}
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -869,6 +885,28 @@ public:
&out_uncond);
negative_data = (float*)out_uncond->data;
}
int step_count = sigmas.size();
bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
float* skip_layer_data = NULL;
if (is_skiplayer_step) {
LOG_DEBUG("Skipping layers at step %d\n", step);
// skip layer (same as conditioned)
diffusion_model->compute(n_threads,
noised_input,
timesteps,
cond.c_crossattn,
cond.c_concat,
cond.c_vector,
guidance_tensor,
-1,
controls,
control_strength,
&out_skip,
NULL,
skip_layers);
skip_layer_data = (float*)out_skip->data;
}
float* vec_denoised = (float*)denoised->data;
float* vec_input = (float*)input->data;
float* positive_data = (float*)out_cond->data;
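A worked example of the step window above, assuming 30 sampling steps (so sigmas.size() == 31) and the default start/end fractions; with those values the condition holds only for step values 1 through 5:

    #include <cstdio>

    // Hedged sketch: which step values satisfy the is_skiplayer_step condition
    // for step_count == 31 and the default window bounds.
    int main() {
        int   step_count       = 31;     // sigmas.size() for 30 sampling steps
        float skip_layer_start = 0.01f;  // lower bound, fraction of the schedule
        float skip_layer_end   = 0.2f;   // upper bound, fraction of the schedule
        for (int step = 0; step <= step_count; step++) {
            if (step > (int)(skip_layer_start * step_count) &&
                step < (int)(skip_layer_end * step_count)) {
                printf("SLG pass at step %d\n", step);  // prints steps 1..5
            }
        }
        return 0;
    }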
@@ -885,6 +923,9 @@ public:
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
}
}
if (is_skiplayer_step) {
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
}
// v = latent_result, eps = latent_result
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
@@ -948,7 +989,7 @@ public:
if (use_tiny_autoencoder) {
C = 4;
} else {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B) {
if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
C = 32;
} else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
C = 32;
@@ -1111,7 +1152,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
std::string input_id_images_path) {
std::string input_id_images_path,
std::vector<int> skip_layers = {},
float slg_scale = 2.5,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1281,7 +1326,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
// Sample
std::vector<struct ggml_tensor*> final_latents; // collect latents to decode
int C = 4;
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
C = 16;
} else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
C = 16;
@@ -1320,7 +1365,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
sample_method,
sigmas,
start_merge_step,
id_cond);
id_cond,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
@@ -1386,7 +1435,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str) {
const char* input_id_images_path_c_str,
std::vector<int> skip_layers,
float slg_scale,
float skip_layer_start,
float skip_layer_end) {
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
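skip_layers is the list of transformer block indices dropped during the extra pass. A hedged caller-side sketch of the new trailing arguments; {7, 8, 9} is a commonly used choice for SD3.5-style MMDiT models and is an assumption here, not a value defined by this diff:

    #include <vector>

    // Indices of MMDiT blocks to skip during the SLG pass (assumed example values).
    std::vector<int> skip_layers = {7, 8, 9};
    float slg_scale        = 2.5f;   // strength of the skip-layer guidance term
    float skip_layer_start = 0.01f;  // SLG active after 1% of the schedule...
    float skip_layer_end   = 0.2f;   // ...and before 20% of the schedule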
@@ -1394,7 +1447,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
params.mem_size *= 3;
}
if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
@@ -1420,7 +1473,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
int C = 4;
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
C = 16;
} else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
C = 16;
@@ -1428,7 +1481,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
int W = width / 8;
int H = height / 8;
ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
ggml_set_f32(init_latent, 0.0609f);
} else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
ggml_set_f32(init_latent, 0.1159f);
@@ -1454,7 +1507,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
control_strength,
style_ratio,
normalize_input,
input_id_images_path_c_str);
input_id_images_path_c_str,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end);
size_t t1 = ggml_time_ms();
@@ -1481,7 +1538,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float control_strength,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str) {
const char* input_id_images_path_c_str,
std::vector<int> skip_layers,
float slg_scale,
float skip_layer_start,
float skip_layer_end) {
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
@@ -1489,7 +1550,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B) {
if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
params.mem_size *= 2;
}
if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
@@ -1555,7 +1616,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
control_strength,
style_ratio,
normalize_input,
input_id_images_path_c_str);
input_id_images_path_c_str,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end);
size_t t2 = ggml_time_ms();