feat: add Qwen Image support (#851)

* add qwen tokenizer

* add qwen2.5 vl support

* mv qwen.hpp -> qwenvl.hpp

* add qwen image model

* add qwen image t2i pipeline

* fix qwen image flash attn

* add qwen image i2i pipeline

* change encoding of vocab_qwen.hpp to utf8

* fix get_first_stage_encoding

* apply jeffbolz f32 patch

https://github.com/leejet/stable-diffusion.cpp/pull/851#issuecomment-3335515302

* fix the issue that occurs when using CUDA with k-quants weights

* optimize the handling of the FeedForward precision fix

* to_add_out precision fix

* update docs
Author: leejet
Date: 2025-10-12 23:23:19 +08:00
Committed by: GitHub
Parent: aa68b875b9
Commit: beb99a2de2
21 changed files with 142312 additions and 272 deletions


@@ -42,6 +42,7 @@ const char* model_version_to_str[] = {
"Wan 2.x",
"Wan 2.2 I2V",
"Wan 2.2 TI2V",
"Qwen Image",
};
const char* sampling_methods_str[] = {
@@ -253,6 +254,13 @@ public:
}
}
if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
}
}
if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) {
LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path);
if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) {
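For context, a minimal usage sketch of wiring up the new qwen2vl_path. Only the sd_ctx_params_t field names visible in this diff are taken from the source; the model paths and the new_sd_ctx call are illustrative assumptions:
    sd_ctx_params_t params = {};
    params.qwen2vl_path         = "Qwen2.5-VL-Instruct.safetensors";  // new Qwen2.5-VL text encoder (placeholder path)
    params.diffusion_model_path = "qwen-image.safetensors";           // Qwen Image DiT weights (placeholder path)
    params.vae_path             = "qwen-image-vae.safetensors";       // Wan-style VAE reused for Qwen Image (placeholder path)
    sd_ctx_t* ctx = new_sd_ctx(&params);                              // assumed entry point; not part of this diff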
@@ -318,7 +326,7 @@ public:
} else if (sd_version_is_flux(version)) {
scale_factor = 0.3611f;
// TODO: shift_factor
} else if (sd_version_is_wan(version)) {
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
scale_factor = 1.0f;
}
@@ -332,7 +340,7 @@ public:
{
clip_backend = backend;
bool use_t5xxl = false;
if (sd_version_is_dit(version)) {
if (sd_version_is_dit(version) && !sd_version_is_qwen_image(version)) {
use_t5xxl = true;
}
if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
@@ -418,6 +426,16 @@ public:
clip_vision->alloc_params_buffer();
clip_vision->get_param_tensors(tensors);
}
} else if (sd_version_is_qwen_image(version)) {
cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types);
diffusion_model = std::make_shared<QwenImageModel>(backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
"model.diffusion_model",
version,
sd_ctx_params->diffusion_flash_attn);
} else { // SD1.x SD2.x SDXL
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
@@ -466,7 +484,7 @@ public:
vae_backend = backend;
}
if (sd_version_is_wan(version)) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
@@ -711,6 +729,13 @@ public:
shift = 5.0;
}
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
} else if (sd_version_is_qwen_image(version)) {
LOG_INFO("running in FLOW mode");
float shift = sd_ctx_params->flow_shift;
if (shift == INFINITY) {
shift = 3.0;
}
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
} else if (is_using_v_parameterization) {
LOG_INFO("running in v-prediction mode");
denoiser = std::make_shared<CompVisVDenoiser>();
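For reference, a minimal sketch of the time shift a discrete flow denoiser applies to its sigmas (assuming the usual SD3/Flux-style shift; the helper name is hypothetical):
    // sigma' = shift * sigma / (1 + (shift - 1) * sigma)
    // e.g. with the Qwen Image default shift = 3.0, sigma = 0.5 maps to 0.75
    static float flow_shift_sigma(float sigma, float shift) {
        return shift * sigma / (1.0f + (shift - 1.0f) * sigma);
    }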
@@ -989,7 +1014,7 @@ public:
ggml_tensor_scale(noise, augmentation_level);
ggml_tensor_add(init_img, noise);
}
ggml_tensor* moments = encode_first_stage(work_ctx, init_img);
ggml_tensor* moments = vae_encode(work_ctx, init_img);
c_concat = get_first_stage_encoding(work_ctx, moments);
}
}
@@ -1298,118 +1323,8 @@ public:
return x;
}
// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
ggml_tensor_set_f32_randn(noise, rng);
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latent->ne[3]; i++) {
for (int j = 0; j < latent->ne[2]; j++) {
for (int k = 0; k < latent->ne[1]; k++) {
for (int l = 0; l < latent->ne[0]; l++) {
mean = ggml_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
value = value * scale_factor;
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_tensor_set_f32(latent, value, l, k, j, i);
}
}
}
}
}
return latent;
}
void get_tile_sizes(int& tile_size_x,
int& tile_size_y,
float& tile_overlap,
const sd_tiling_params_t& params,
int latent_x,
int latent_y,
float encoding_factor = 1.0f) {
tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f);
auto get_tile_size = [&](int requested_size, float factor, int latent_size) {
const int default_tile_size = 32;
const int min_tile_dimension = 4;
int tile_size = default_tile_size;
// factor <= 1 means simple fraction of the latent dimension
// factor > 1 means number of tiles across that dimension
if (factor > 0.f) {
if (factor > 1.0)
factor = 1 / (factor - factor * tile_overlap + tile_overlap);
tile_size = std::round(latent_size * factor);
} else if (requested_size >= min_tile_dimension) {
tile_size = requested_size;
}
tile_size *= encoding_factor;
return std::max(std::min(tile_size, latent_size), min_tile_dimension);
};
tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x);
tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
}
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
int64_t t0 = ggml_time_ms();
ggml_tensor* result = NULL;
int W = x->ne[0] / 8;
int H = x->ne[1] / 8;
if (vae_tiling_params.enabled && !encode_video) {
// TODO wan2.2 vae support?
int C = sd_version_is_dit(version) ? 16 : 4;
if (!use_tiny_autoencoder) {
C *= 2;
}
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
}
if (!use_tiny_autoencoder) {
process_vae_input_tensor(x);
if (vae_tiling_params.enabled && !encode_video) {
float tile_overlap;
int tile_size_x, tile_size_y;
// multiply tile size for encode to keep the compute buffer size consistent
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, false, &out, work_ctx);
};
sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else {
first_stage_model->compute(n_threads, x, false, &result, work_ctx);
}
first_stage_model->free_compute_buffer();
} else {
if (vae_tiling_params.enabled && !encode_video) {
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, false, &out, NULL);
};
sd_tiling(x, result, 8, 64, 0.5f, on_tiling);
} else {
tae_first_stage->compute(n_threads, x, false, &result, work_ctx);
}
tae_first_stage->free_compute_buffer();
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
return result;
}
void process_latent_in(ggml_tensor* latent) {
if (sd_version_is_wan(version)) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
GGML_ASSERT(latent->ne[3] == 16 || latent->ne[3] == 48);
std::vector<float> latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
@@ -1449,7 +1364,7 @@ public:
}
void process_latent_out(ggml_tensor* latent) {
if (sd_version_is_wan(version)) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
GGML_ASSERT(latent->ne[3] == 16 || latent->ne[3] == 48);
std::vector<float> latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
@@ -1488,6 +1403,146 @@ public:
}
}
void get_tile_sizes(int& tile_size_x,
int& tile_size_y,
float& tile_overlap,
const sd_tiling_params_t& params,
int latent_x,
int latent_y,
float encoding_factor = 1.0f) {
tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f);
auto get_tile_size = [&](int requested_size, float factor, int latent_size) {
const int default_tile_size = 32;
const int min_tile_dimension = 4;
int tile_size = default_tile_size;
// factor <= 1 means simple fraction of the latent dimension
// factor > 1 means number of tiles across that dimension
if (factor > 0.f) {
if (factor > 1.0)
factor = 1 / (factor - factor * tile_overlap + tile_overlap);
tile_size = std::round(latent_size * factor);
} else if (requested_size >= min_tile_dimension) {
tile_size = requested_size;
}
tile_size *= encoding_factor;
return std::max(std::min(tile_size, latent_size), min_tile_dimension);
};
tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x);
tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
}
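A quick worked example of the factor > 1 branch above (numbers are illustrative): with latent_size = 128, a requested rel_size of 4 tiles and tile_overlap = 0.25, factor becomes 1 / (4 - 4*0.25 + 0.25) = 1 / 3.25 ≈ 0.308, so tile_size = round(128 * 0.308) = 39; four 39-wide tiles at 25% overlap then span about 39 + 3 * 29.25 ≈ 127 latent units, i.e. the requested tile count covers the whole dimension.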
ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
int64_t t0 = ggml_time_ms();
ggml_tensor* result = NULL;
int W = x->ne[0] / 8;
int H = x->ne[1] / 8;
if (vae_tiling_params.enabled && !encode_video) {
// TODO wan2.2 vae support?
int C = sd_version_is_dit(version) ? 16 : 4;
if (!use_tiny_autoencoder) {
C *= 2;
}
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
}
if (sd_version_is_qwen_image(version)) {
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
}
if (!use_tiny_autoencoder) {
process_vae_input_tensor(x);
if (vae_tiling_params.enabled && !encode_video) {
float tile_overlap;
int tile_size_x, tile_size_y;
// multiply tile size for encode to keep the compute buffer size consistent
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
first_stage_model->compute(n_threads, in, false, &out, work_ctx);
};
sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else {
first_stage_model->compute(n_threads, x, false, &result, work_ctx);
}
first_stage_model->free_compute_buffer();
} else {
if (vae_tiling_params.enabled && !encode_video) {
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
tae_first_stage->compute(n_threads, in, false, &out, NULL);
};
sd_tiling(x, result, 8, 64, 0.5f, on_tiling);
} else {
tae_first_stage->compute(n_threads, x, false, &result, work_ctx);
}
tae_first_stage->free_compute_buffer();
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
return result;
}
ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
ggml_tensor_set_f32_randn(noise, rng);
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latent->ne[3]; i++) {
for (int j = 0; j < latent->ne[2]; j++) {
for (int k = 0; k < latent->ne[1]; k++) {
for (int l = 0; l < latent->ne[0]; l++) {
mean = ggml_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_tensor_set_f32(latent, value, l, k, j, i);
}
}
}
}
}
return latent;
}
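In equation form, the loop above computes z = mean + exp(0.5 * clamp(logvar, -30, 20)) * eps with eps ~ N(0, I), the standard reparameterized sample from the diagonal Gaussian whose (mean, logvar) pairs the VAE encoder stacks along ne[2].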
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* vae_output) {
ggml_tensor* latent;
if (use_tiny_autoencoder || sd_version_is_qwen_image(version) || sd_version_is_wan(version)) {
latent = vae_output;
} else if (version == VERSION_SD1_PIX2PIX) {
latent = ggml_view_3d(work_ctx,
vae_output,
vae_output->ne[0],
vae_output->ne[1],
vae_output->ne[2] / 2,
vae_output->nb[1],
vae_output->nb[2],
0);
} else {
latent = gaussian_latent_sample(work_ctx, vae_output);
}
process_latent_in(latent);
if (sd_version_is_qwen_image(version)) {
latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1);
}
return latent;
}
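A small layout illustration of the Qwen Image reshapes above, assuming ggml's [ne0 = W, ne1 = H, ne2, ne3] ordering:
    // vae_encode:               image  [W, H, 3, 1]      -> [W, H, 1, 3]      (one "frame", channels moved to ne[3] for the Wan-style VAE)
    // get_first_stage_encoding: latent [W/8, H/8, 1, 16] -> [W/8, H/8, 16, 1] (back to an image-style layout for the DiT)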
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
ggml_tensor* vae_output = vae_encode(work_ctx, x, encode_video);
return get_first_stage_encoding(work_ctx, vae_output);
}
ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) {
int64_t W = x->ne[0] * 8;
int64_t H = x->ne[1] * 8;
@@ -1518,6 +1573,9 @@ public:
}
int64_t t0 = ggml_time_ms();
if (!use_tiny_autoencoder) {
if (sd_version_is_qwen_image(version)) {
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
}
process_latent_out(x);
// x = load_tensor_from_file(work_ctx, "wan_vae_z.bin");
if (vae_tiling_params.enabled && !decode_video) {
@@ -1689,6 +1747,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"clip_g_path: %s\n"
"clip_vision_path: %s\n"
"t5xxl_path: %s\n"
"qwen2vl_path: %s\n"
"diffusion_model_path: %s\n"
"high_noise_diffusion_model_path: %s\n"
"vae_path: %s\n"
@@ -1716,6 +1775,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
SAFE_STR(sd_ctx_params->clip_g_path),
SAFE_STR(sd_ctx_params->clip_vision_path),
SAFE_STR(sd_ctx_params->t5xxl_path),
SAFE_STR(sd_ctx_params->qwen2vl_path),
SAFE_STR(sd_ctx_params->diffusion_model_path),
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
SAFE_STR(sd_ctx_params->vae_path),
@@ -2079,6 +2139,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
C = 16;
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
C = 16;
} else if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
C = 16;
}
int W = width / 8;
int H = height / 8;
@@ -2086,12 +2148,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
struct ggml_tensor* control_latent = NULL;
if (sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL) {
if (!sd_ctx->sd->use_tiny_autoencoder) {
struct ggml_tensor* control_moments = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
control_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, control_moments);
} else {
control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
}
control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
ggml_tensor_scale(control_latent, control_strength);
}
@@ -2278,6 +2335,8 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
C = 16;
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
C = 16;
} else if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
C = 16;
} else if (sd_version_is_wan(sd_ctx->sd->version)) {
C = 16;
T = ((T - 1) / 4) + 1;
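As a concrete example of the resulting shape: a 1024x1024 Qwen Image request produces an init latent of width/8 x height/8 x C = 128 x 128 x 16, the same 16-channel layout used by the Wan branch above.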
@@ -2354,7 +2413,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
ggml_tensor* init_latent = NULL;
ggml_tensor* init_moments = NULL;
ggml_tensor* concat_latent = NULL;
ggml_tensor* denoise_mask = NULL;
if (sd_img_gen_params->init_image.data) {
@@ -2374,12 +2432,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_image_to_tensor(sd_img_gen_params->mask_image, mask_img);
sd_image_to_tensor(sd_img_gen_params->init_image, init_img);
if (!sd_ctx->sd->use_tiny_autoencoder) {
init_moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
} else {
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
}
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
int64_t mask_channels = 1;
@@ -2389,16 +2442,12 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
mask_channels = 1 + init_latent->ne[2];
}
ggml_tensor* masked_latent = NULL;
if (sd_ctx->sd->version != VERSION_FLEX_2) {
// most inpaint models mask before vae
ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
sd_apply_mask(init_img, mask_img, masked_img);
if (!sd_ctx->sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
}
masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
} else {
// mask after vae
masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
@@ -2458,7 +2507,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
}
}
}
} else {
LOG_INFO("TXT2IMG");
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
@@ -2497,23 +2545,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
1);
sd_image_to_tensor(*ref_images[i], img);
ggml_tensor* latent = NULL;
if (sd_ctx->sd->use_tiny_autoencoder) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
} else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = ggml_view_3d(work_ctx,
latent,
latent->ne[0],
latent->ne[1],
latent->ne[2] / 2,
latent->nb[1],
latent->nb[2],
0);
} else {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
}
ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
ref_latents.push_back(latent);
}
@@ -2675,8 +2707,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int64_t t2 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
sd_ctx->sd->process_latent_in(concat_latent);
ggml_tensor* concat_mask = ggml_new_tensor_4d(work_ctx,
GGML_TYPE_F32,
concat_latent->ne[0],
@@ -2702,7 +2732,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
sd_image_to_tensor(sd_vid_gen_params->init_image, init_img);
init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3);
auto init_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); // [b*c, 1, h/16, w/16]
auto init_image_latent = sd_ctx->sd->vae_encode(work_ctx, init_img); // [b*c, 1, h/16, w/16]
init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
@@ -2733,7 +2763,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3);
ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16]
sd_ctx->sd->process_latent_in(ref_image_latent);
auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent);
ggml_set_f32(zero_latent, 0.f);
ref_image_latent = ggml_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3); // [b*2*c, 1, h/16, w/16]
@@ -2765,9 +2794,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive); // [b*c, t, h/8, w/8]
reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive); // [b*c, t, h/8, w/8]
sd_ctx->sd->process_latent_in(inactive);
sd_ctx->sd->process_latent_in(reactive);
int64_t length = inactive->ne[2];
if (ref_image_latent) {
length += 1;