mirror of https://github.com/BillyOutlast/rocm-stable-diffusion.cpp.git
synced 2026-02-04 03:01:18 +01:00
feat: add Qwen Image Edit support (#877)
* add ref latent support for qwen image
* optimize clip_preprocess and fix get_first_stage_encoding
* add qwen2vl vit support
* add qwen image edit support
* fix qwen image edit pipeline
* add mmproj file support
* support dynamic number of Qwen image transformer blocks
* set prompt_template_encode_start_idx every time
* to_add_out precision fix
* to_out.0 precision fix
* update docs
@@ -261,6 +261,13 @@ public:
             }
         }

+        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
+            LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
+            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
+                LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+            }
+        }
+
         if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) {
             LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path);
             if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) {
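Note: the new qwen2vl_vision_path lets the Qwen2-VL ViT (mmproj) weights ship as a separate file from the text encoder; they load under the "text_encoders.qwen2vl.visual." prefix, and a failure here only warns rather than aborting. A minimal setup sketch against the public C API; sd_ctx_params_init()/new_sd_ctx() are taken from stable-diffusion.h, and every file name below is hypothetical:

    #include "stable-diffusion.h"

    sd_ctx_t* make_qwen_edit_ctx(void) {
        sd_ctx_params_t params;
        sd_ctx_params_init(&params);  // fill all fields with defaults first
        params.diffusion_model_path = "qwen-image-edit.safetensors";       // hypothetical paths
        params.qwen2vl_path         = "qwen2.5-vl-7b.safetensors";         // text encoder
        params.qwen2vl_vision_path  = "qwen2.5-vl-7b-mmproj.safetensors";  // ViT / mmproj weights
        params.vae_path             = "qwen-image-vae.safetensors";
        return new_sd_ctx(&params);   // NULL on failure
    }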
@@ -274,6 +281,15 @@ public:
             return false;
         }

+        auto& tensor_types = model_loader.tensor_storages_types;
+        for (auto& item : tensor_types) {
+            // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
+            if (contains(item.first, "qwen2vl") && ends_with(item.first, "weight") && (item.second == GGML_TYPE_F32 || item.second == GGML_TYPE_BF16)) {
+                item.second = GGML_TYPE_F16;
+                // LOG_DEBUG(" change %s %u", item.first.c_str(), item.second);
+            }
+        }
+
         LOG_INFO("Version: %s ", model_version_to_str[version]);
         ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
                               ? (ggml_type)sd_ctx_params->wtype
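The loop above rewrites the recorded storage type of each qwen2vl weight matrix before any backend buffer is allocated, so vision weights stored as F32 or BF16 are held in F16 instead, halving their memory footprint relative to F32. A standalone sketch of the same filter, with stand-in enum values and string helpers (contains()/ends_with() mirror the project's internal helpers):

    #include <map>
    #include <string>

    enum WType { F32, F16, BF16 };  // stand-ins for the ggml_type values

    static bool contains(const std::string& s, const std::string& sub) {
        return s.find(sub) != std::string::npos;
    }
    static bool ends_with(const std::string& s, const std::string& suf) {
        return s.size() >= suf.size() &&
               s.compare(s.size() - suf.size(), suf.size(), suf) == 0;
    }

    // force every "...qwen2vl...weight" tensor recorded as F32/BF16 down to F16
    void force_qwen2vl_f16(std::map<std::string, WType>& tensor_types) {
        for (auto& item : tensor_types) {
            if (contains(item.first, "qwen2vl") && ends_with(item.first, "weight") &&
                (item.second == F32 || item.second == BF16)) {
                item.second = F16;
            }
        }
    }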
@@ -417,9 +433,15 @@ public:
                 clip_vision->get_param_tensors(tensors);
             }
         } else if (sd_version_is_qwen_image(version)) {
+            bool enable_vision = false;
+            if (!vae_decode_only) {
+                enable_vision = true;
+            }
             cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
                                                                         offload_params_to_cpu,
-                                                                        model_loader.tensor_storages_types);
+                                                                        model_loader.tensor_storages_types,
+                                                                        "",
+                                                                        enable_vision);
             diffusion_model = std::make_shared<QwenImageModel>(backend,
                                                                offload_params_to_cpu,
                                                                model_loader.tensor_storages_types,
@@ -590,7 +612,9 @@ public:

         if (vae_decode_only) {
             ignore_tensors.insert("first_stage_model.encoder");
+            ignore_tensors.insert("first_stage_model.conv1");
             ignore_tensors.insert("first_stage_model.quant");
+            ignore_tensors.insert("text_encoders.qwen2vl.visual.");
         }
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
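With a decode-only VAE there is nothing to encode, so the VAE encoder side and the entire Qwen2-VL vision tower can be skipped at load time. The ignore set is matched against tensor names by prefix; a small sketch of that check, assuming the loader compares prefixes the way the project's starts_with helper does:

    #include <set>
    #include <string>

    // true when `name` begins with any prefix in `ignore`
    // (stand-in for the loader's real check)
    bool should_ignore(const std::string& name, const std::set<std::string>& ignore) {
        for (const auto& prefix : ignore) {
            if (name.compare(0, prefix.size(), prefix) == 0) {
                return true;
            }
        }
        return false;
    }
    // should_ignore("text_encoders.qwen2vl.visual.patch_embed.proj.weight",
    //               {"text_encoders.qwen2vl.visual."}) == true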
@@ -949,12 +973,12 @@ public:
             ggml_set_f32(output, 0.f);
         } else {
             sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image);
-            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size);
+            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
             free(image.data);
             image.data = NULL;

             ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
-            sd_image_f32_to_tensor(resized_image.data, pixel_values, false);
+            sd_image_f32_to_tensor(resized_image, pixel_values, false);
             free(resized_image.data);
             resized_image.data = NULL;

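clip_preprocess now takes the target width and height as separate arguments instead of one square size: the CLIP-vision call sites pass the same value twice, while the Qwen2-VL path can ask for non-square targets. An illustrative sketch of such a preprocess step, not the library's exact code (the real clip_preprocess differs at least in resampling quality); the mean/std constants are the usual OpenAI CLIP values:

    #include <cstddef>
    #include <vector>

    struct ImageF32 {
        int width, height;  // interleaved RGB, values in [0, 1]
        std::vector<float> data;
    };

    ImageF32 preprocess(const ImageF32& in, int out_w, int out_h) {
        static const float mean[3] = {0.48145466f, 0.4578275f, 0.40821073f};
        static const float stdv[3] = {0.26862954f, 0.26130258f, 0.27577711f};
        ImageF32 out{out_w, out_h, std::vector<float>((size_t)out_w * out_h * 3)};
        for (int y = 0; y < out_h; y++) {
            for (int x = 0; x < out_w; x++) {
                int sx = x * in.width / out_w;   // nearest-neighbor sample
                int sy = y * in.height / out_h;
                for (int c = 0; c < 3; c++) {
                    float v = in.data[((size_t)sy * in.width + sx) * 3 + c];
                    out.data[((size_t)y * out_w + x) * 3 + c] = (v - mean[c]) / stdv[c];
                }
            }
        }
        return out;
    }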
@@ -991,7 +1015,7 @@ public:
             sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height);
             free(image.data);
             image.data = NULL;
-            sd_image_f32_to_tensor(resized_image.data, init_img, false);
+            sd_image_f32_to_tensor(resized_image, init_img, false);
             free(resized_image.data);
             resized_image.data = NULL;
         } else {
@@ -1749,6 +1773,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         "clip_vision_path: %s\n"
         "t5xxl_path: %s\n"
         "qwen2vl_path: %s\n"
+        "qwen2vl_vision_path: %s\n"
         "diffusion_model_path: %s\n"
         "high_noise_diffusion_model_path: %s\n"
         "vae_path: %s\n"
@@ -1777,6 +1802,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         SAFE_STR(sd_ctx_params->clip_vision_path),
         SAFE_STR(sd_ctx_params->t5xxl_path),
         SAFE_STR(sd_ctx_params->qwen2vl_path),
+        SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
         SAFE_STR(sd_ctx_params->diffusion_model_path),
         SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
         SAFE_STR(sd_ctx_params->vae_path),
@@ -1987,6 +2013,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     sd_image_t control_image,
                                     float control_strength,
                                     sd_pm_params_t pm_params,
                                     std::vector<sd_image_t*> ref_images,
+                                    std::vector<ggml_tensor*> ref_latents,
                                     bool increase_ref_index,
                                     ggml_tensor* concat_latent = NULL,
@@ -2019,6 +2046,14 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
     ggml_tensor* init_img = NULL;
     SDCondition id_cond;
     std::vector<bool> class_tokens_mask;
+
+    ConditionerParams condition_params;
+    condition_params.clip_skip = clip_skip;
+    condition_params.width = width;
+    condition_params.height = height;
+    condition_params.ref_images = ref_images;
+    condition_params.adm_in_channels = sd_ctx->sd->diffusion_model->get_adm_in_channels();
+
     if (sd_ctx->sd->stacked_id) {
         if (!sd_ctx->sd->pmid_lora->applied) {
             int64_t t0 = ggml_time_ms();
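This is the visible half of a refactor that replaces long positional argument lists (prompt, clip_skip, width, height, ...) with a single ConditionerParams struct. A plausible shape, inferred purely from the fields this diff assigns; the real definition lives elsewhere in the repository and likely carries more fields:

    #include <string>
    #include <vector>

    struct sd_image_t;  // from stable-diffusion.h

    struct ConditionerParams {
        std::string text;
        int clip_skip        = -1;
        int width            = 0;
        int height           = 0;
        int adm_in_channels  = -1;
        int num_input_imgs   = 0;
        bool zero_out_masked = false;
        std::vector<sd_image_t*> ref_images;
    };

With defaulted members each call site sets only what it needs, and a new conditioner input (such as ref_images for Qwen Image Edit) no longer forces a signature change on every get_learned_condition overload.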
@@ -2041,7 +2076,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         std::vector<sd_image_f32_t> processed_id_images;
         for (int i = 0; i < pm_params.id_images_count; i++) {
             sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]);
-            sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size);
+            sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
             free(id_image.data);
             id_image.data = NULL;
             processed_id_images.push_back(processed_id_image);
@@ -2058,17 +2093,15 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         }
         processed_id_images.clear();

-        int64_t t0 = ggml_time_ms();
-        auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
-                                                                                         sd_ctx->sd->n_threads, prompt,
-                                                                                         clip_skip,
-                                                                                         width,
-                                                                                         height,
-                                                                                         pm_params.id_images_count,
-                                                                                         sd_ctx->sd->diffusion_model->get_adm_in_channels());
-        id_cond = std::get<0>(cond_tup);
-        class_tokens_mask = std::get<1>(cond_tup);  //
-        struct ggml_tensor* id_embeds = NULL;
+        int64_t t0 = ggml_time_ms();
+        condition_params.text = prompt;
+        condition_params.num_input_imgs = pm_params.id_images_count;
+        auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
+                                                                                         sd_ctx->sd->n_threads,
+                                                                                         condition_params);
+        id_cond = std::get<0>(cond_tup);
+        class_tokens_mask = std::get<1>(cond_tup);  //
+        struct ggml_tensor* id_embeds = NULL;
         if (pmv2 && pm_params.id_embed_path != nullptr) {
             id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
             // print_ggml_tensor(id_embeds, true, "id_embeds:");
@@ -2094,14 +2127,12 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
     }

     // Get learned condition
-    t0 = ggml_time_ms();
-    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                           sd_ctx->sd->n_threads,
-                                                                           prompt,
-                                                                           clip_skip,
-                                                                           width,
-                                                                           height,
-                                                                           sd_ctx->sd->diffusion_model->get_adm_in_channels());
+    t0 = ggml_time_ms();
+    condition_params.text = prompt;
+    condition_params.zero_out_masked = false;
+    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
+                                                                           sd_ctx->sd->n_threads,
+                                                                           condition_params);

     SDCondition uncond;
     if (guidance.txt_cfg != 1.0 ||
@@ -2110,14 +2141,11 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
             zero_out_masked = true;
         }
-        uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                     sd_ctx->sd->n_threads,
-                                                                     negative_prompt,
-                                                                     clip_skip,
-                                                                     width,
-                                                                     height,
-                                                                     sd_ctx->sd->diffusion_model->get_adm_in_channels(),
-                                                                     zero_out_masked);
+        condition_params.text = negative_prompt;
+        condition_params.zero_out_masked = zero_out_masked;
+        uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
+                                                                     sd_ctx->sd->n_threads,
+                                                                     condition_params);
     }
     int64_t t1 = ggml_time_ms();
     LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
@@ -2538,13 +2566,42 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g

     std::vector<ggml_tensor*> ref_latents;
     for (int i = 0; i < ref_images.size(); i++) {
-        ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
-                                              GGML_TYPE_F32,
-                                              ref_images[i]->width,
-                                              ref_images[i]->height,
-                                              3,
-                                              1);
-        sd_image_to_tensor(*ref_images[i], img);
+        ggml_tensor* img;
+        if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
+            sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
+            int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height);
+            double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
+            double vae_height = vae_width * ref_image.height / ref_image.width;
+
+            vae_height = round(vae_height / 32) * 32;
+            vae_width = round(vae_width / 32) * 32;
+
+            sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
+            free(ref_image.data);
+            ref_image.data = nullptr;
+
+            LOG_DEBUG("resize vae ref image %d from %dx%d to %dx%d", i, ref_image.height, ref_image.width, resized_image.height, resized_image.width);
+
+            img = ggml_new_tensor_4d(work_ctx,
+                                     GGML_TYPE_F32,
+                                     resized_image.width,
+                                     resized_image.height,
+                                     3,
+                                     1);
+            sd_image_f32_to_tensor(resized_image, img);
+            free(resized_image.data);
+            resized_image.data = nullptr;
+        } else {
+            img = ggml_new_tensor_4d(work_ctx,
+                                     GGML_TYPE_F32,
+                                     ref_images[i]->width,
+                                     ref_images[i]->height,
+                                     3,
+                                     1);
+            sd_image_to_tensor(*ref_images[i], img);
+        }
+
+        // print_ggml_tensor(img, false, "img");

         ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
         ref_latents.push_back(latent);
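The Qwen-Image branch above rescales each reference image before VAE encoding: the pixel budget is capped at min(1024 * 1024, width * height), the aspect ratio is preserved through the sqrt() relation, and both sides are snapped to multiples of 32. A worked example (all-double arithmetic here, whereas the code above mixes int and double):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        int ref_w = 1328, ref_h = 800;   // reference image
        int out_w = 1024, out_h = 1024;  // requested generation size

        int area = std::min(1024 * 1024, out_w * out_h);     // pixel budget: 1048576
        double w = std::sqrt((double)area * ref_w / ref_h);  // ~1319.3, keeps aspect
        double h = w * ref_h / ref_w;                        // ~794.8
        w = std::round(w / 32) * 32;                         // snap to 32: 1312
        h = std::round(h / 32) * 32;                         // snap to 32: 800
        std::printf("%dx%d -> %.0fx%.0f\n", ref_w, ref_h, w, h);  // 1328x800 -> 1312x800
        return 0;
    }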
@@ -2578,6 +2635,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                    sd_img_gen_params->control_image,
                                    sd_img_gen_params->control_strength,
                                    sd_img_gen_params->pm_params,
                                    ref_images,
+                                   ref_latents,
                                    sd_img_gen_params->increase_ref_index,
                                    concat_latent,
@@ -2835,30 +2893,25 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     }

     // Get learned condition
-    bool zero_out_masked = true;
-    int64_t t1 = ggml_time_ms();
-    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                           sd_ctx->sd->n_threads,
-                                                                           prompt,
-                                                                           sd_vid_gen_params->clip_skip,
-                                                                           width,
-                                                                           height,
-                                                                           sd_ctx->sd->diffusion_model->get_adm_in_channels(),
-                                                                           zero_out_masked);
-    cond.c_concat = concat_latent;
-    cond.c_vector = clip_vision_output;
+    ConditionerParams condition_params;
+    condition_params.clip_skip = sd_vid_gen_params->clip_skip;
+    condition_params.zero_out_masked = true;
+    condition_params.text = prompt;
+
+    int64_t t1 = ggml_time_ms();
+    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
+                                                                           sd_ctx->sd->n_threads,
+                                                                           condition_params);
+    cond.c_concat = concat_latent;
+    cond.c_vector = clip_vision_output;
     SDCondition uncond;
     if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) {
-        uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
-                                                                     sd_ctx->sd->n_threads,
-                                                                     negative_prompt,
-                                                                     sd_vid_gen_params->clip_skip,
-                                                                     width,
-                                                                     height,
-                                                                     sd_ctx->sd->diffusion_model->get_adm_in_channels(),
-                                                                     zero_out_masked);
-        uncond.c_concat = concat_latent;
-        uncond.c_vector = clip_vision_output;
+        condition_params.text = negative_prompt;
+        uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
+                                                                     sd_ctx->sd->n_threads,
+                                                                     condition_params);
+        uncond.c_concat = concat_latent;
+        uncond.c_vector = clip_vision_output;
     }
     int64_t t2 = ggml_time_ms();
     LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);