// rocm-stable-diffusion.cpp/stable-diffusion.cpp
//
// Last commit: "feat: add option to switch the sigma schedule (#51)",
// Urs Ganse, 2023-09-09. Concretely, this allows switching to the "Karras"
// schedule from the Karras et al. 2022 paper, equivalent to the samplers
// marked as "Karras" in the AUTOMATIC1111 WebUI. This choice is in principle
// orthogonal to the sampler choice and can be given independently.

#include <assert.h>
#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "ggml/ggml.h"
#include "rng.h"
#include "rng_philox.h"
#include "stable-diffusion.h"
static SDLogLevel log_level = SDLogLevel::INFO;
#define __FILENAME__ "stable-diffusion.cpp"
#define SD_LOG(level, format, ...) \
do { \
if (level < log_level) { \
break; \
} \
if (level == SDLogLevel::DEBUG) { \
printf("[DEBUG] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
fflush(stdout); \
} else if (level == SDLogLevel::INFO) { \
printf("[INFO] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
fflush(stdout); \
} else if (level == SDLogLevel::WARN) { \
fprintf(stderr, "[WARN] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
fflush(stderr); \
} else if (level == SDLogLevel::ERROR) { \
fprintf(stderr, "[ERROR] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
fflush(stderr); \
} \
} while (0)
#define LOG_DEBUG(format, ...) SD_LOG(SDLogLevel::DEBUG, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) SD_LOG(SDLogLevel::INFO, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) SD_LOG(SDLogLevel::WARN, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) SD_LOG(SDLogLevel::ERROR, format, ##__VA_ARGS__)
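// Usage sketch (illustrative only, not calls present elsewhere in this file):
//   LOG_INFO("loading model from '%s'", file_path.c_str());
//   LOG_ERROR("unknown tensor type %d", ttype);
// DEBUG/INFO go to stdout, WARN/ERROR to stderr; anything below log_level
// is skipped at runtime by the early `break` in SD_LOG.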
#define GGML_FILE_MAGIC 0x67676d6c // "ggml" in ASCII
#define TIMESTEPS 1000
enum ModelType {
SD1 = 0,
SD2 = 1,
MODEL_TYPE_COUNT,
};
const char* model_type_to_str[] = {
"SD1.x",
"SD2.x"};
/*================================================== Helper Functions ================================================*/
void set_sd_log_level(SDLogLevel level) {
log_level = level;
}
std::string sd_get_system_info() {
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
return ss.str();
}
ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return NULL;
}
int32_t n_dims;
int32_t length;
int32_t ttype;
file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
file.read(reinterpret_cast<char*>(&length), sizeof(length));
file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
if (file.eof()) {
LOG_ERROR("incomplete file '%s'", file_path.c_str());
return NULL;
}
int32_t nelements = 1;
int32_t ne[4] = {1, 1, 1, 1};
for (int i = 0; i < n_dims; ++i) {
file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
file.read(&name[0], length);
ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
const size_t bpe = ggml_type_size(ggml_type(ttype)); // bytes per element (currently unused here)
file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
return tensor;
}
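// On-disk layout consumed above (a sketch inferred from the reads, not a
// formal spec of the format):
//   int32 n_dims | int32 name_length | int32 ttype
//   | int32 ne[n_dims] | char name[name_length] | raw data (ggml_nbytes)
// The name is read only to advance the stream; it is not attached to the
// returned tensor.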
void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
uint32_t n = ggml_nelements(tensor);
std::vector<float> random_numbers = rng->randn(n);
for (uint32_t i = 0; i < n; i++) {
ggml_set_f32_1d(tensor, i, random_numbers[i]);
}
}
// set tensor[i, j, k, l]
// set tensor[l]
// set tensor[k, l]
// set tensor[j, k, l]
void ggml_tensor_set_f32(struct ggml_tensor* tensor, float value, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(float));
*(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]) = value;
}
float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
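// Indexing sketch for the two accessors above (illustrative): for a
// [w, h, c, n] tensor, element (x, y) of channel ch in batch b is
//   float v = ggml_tensor_get_f32(t, x, y, ch, b);
//   ggml_tensor_set_f32(t, 2.0f * v, x, y, ch, b);
// i.e. the first index walks ne[0], the fastest-varying dimension.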
void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
fflush(stdout);
if (shape_only) {
return;
}
int range = 3;
for (int i = 0; i < tensor->ne[3]; i++) {
if (i >= range && i + range < tensor->ne[3]) {
continue;
}
for (int j = 0; j < tensor->ne[2]; j++) {
if (j >= range && j + range < tensor->ne[2]) {
continue;
}
for (int k = 0; k < tensor->ne[1]; k++) {
if (k >= range && k + range < tensor->ne[1]) {
continue;
}
for (int l = 0; l < tensor->ne[0]; l++) {
if (l >= range && l + range < tensor->ne[0]) {
continue;
}
printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
fflush(stdout);
}
}
}
}
}
void copy_ggml_tensor(
struct ggml_tensor* dst,
const struct ggml_tensor* src) {
dst->nb[0] = src->nb[0];
dst->nb[1] = src->nb[1];
dst->nb[2] = src->nb[2];
dst->nb[3] = src->nb[3];
memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
}
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
void set_timestep_embedding(struct ggml_tensor* timesteps, struct ggml_tensor* embedding, int dim, int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N] (dim padded to the next even number when odd)
int half = dim / 2;
std::vector<float> freqs(half);
for (int i = 0; i < half; ++i) {
freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
}
for (int i = 0; i < timesteps->ne[0]; ++i) {
for (int j = 0; j < half; ++j) {
float arg = ggml_get_f32_1d(timesteps, i) * freqs[j];
ggml_tensor_set_f32(embedding, std::cos(arg), j, i);
ggml_tensor_set_f32(embedding, std::sin(arg), j + half, i);
}
if (dim % 2 != 0) {
*(float*)((char*)embedding->data + i * embedding->nb[1] + dim * embedding->nb[0]) = 0;
}
}
}
struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx, struct ggml_tensor* timesteps, int dim, int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N] (dim padded to the next even number when odd)
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
if (!ggml_get_no_alloc(ctx)) {
set_timestep_embedding(timesteps, embedding, dim, max_period);
}
return embedding;
}
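// In math form (a restatement of the loops above), with half = dim / 2:
//   freqs[i]             = exp(-ln(max_period) * i / half),  0 <= i < half
//   embedding[i, t]      = cos(timesteps[t] * freqs[i])
//   embedding[i+half, t] = sin(timesteps[t] * freqs[i])
// and the padded last element is zeroed when dim is odd.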
std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
int64_t w = t->ne[0];
int64_t h = t->ne[1];
int64_t c = t->ne[2];
std::vector<uint8_t> vec;
vec.resize(w * h * c);
uint8_t* data = (uint8_t*)vec.data();
for (int i = 0; i < h; i++) {
for (int j = 0; j < w; j++) {
for (int k = 0; k < c; k++) {
float value = ggml_tensor_get_f32(t, j, i, k);
value = (value + 1.0f) * 0.5f;
if (value < 0) {
value = 0;
} else if (value > 1) {
value = 1;
}
value *= 255.f;
*(data + i * w * c + j * c + k) = (uint8_t)value;
}
}
}
return vec;
}
void image_vec_to_ggml(const std::vector<uint8_t>& vec,
struct ggml_tensor* t) {
int64_t w = t->ne[0];
int64_t h = t->ne[1];
int64_t c = t->ne[2];
uint8_t* data = (uint8_t*)vec.data();
for (int i = 0; i < h; i++) {
for (int j = 0; j < w; j++) {
for (int k = 0; k < c; k++) {
float value = *(data + i * w * c + j * c + k);
value = value / 255.f;
value = 2 * value - 1;
ggml_tensor_set_f32(t, value, j, i, k);
}
}
}
}
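// Value mapping used by the two converters above: ggml_to_image_vec takes
// decoded floats in [-1, 1] to bytes via (v + 1) / 2 * 255 with clamping
// (e.g. 0.0f -> 127), and image_vec_to_ggml inverts it with
// v = 2 * (byte / 255) - 1 (e.g. 255 -> 1.0f).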
struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
struct ggml_tensor* a) {
return ggml_group_norm(ctx, a, 32);
}
/*================================================== CLIPTokenizer ===================================================*/
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;
// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
// TODO: implement bpe
class CLIPTokenizer {
private:
ModelType model_type = SD1;
std::map<std::string, int32_t> encoder;
std::regex pat;
static std::string strip(const std::string& str) {
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
if (start == std::string::npos) {
// String contains only whitespace characters
return "";
}
return str.substr(start, end - start + 1);
}
static std::string whitespace_clean(std::string text) {
text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
text = strip(text);
return text;
}
public:
CLIPTokenizer(ModelType model_type = SD1)
: model_type(model_type) {}
std::string bpe(std::string token) {
std::string word = token + "</w>";
if (encoder.find(word) != encoder.end()) {
return word;
} else if (encoder.find(token) != encoder.end()) {
return token;
}
return UNK_TOKEN;
}
void add_token(std::string token, int32_t token_id) {
encoder[token] = token_id;
}
std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
std::vector<int32_t> tokens = encode(text);
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
} else {
tokens.push_back(EOS_TOKEN_ID);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (model_type == SD2) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
}
}
}
return tokens;
}
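// Example (hand-derived sketch, assuming the usual CLIP vocabulary has been
// loaded via add_token): tokenize("a photo", 77, true) would yield
//   [BOS_TOKEN_ID, id("a</w>"), id("photo</w>"), EOS_TOKEN_ID, pad, ..., pad]
// padded to 77 ids, where pad is PAD_TOKEN_ID (49407) for SD1.x and 0 for SD2.x.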
std::vector<int> encode(std::string text) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text;
std::vector<std::string> token_strs;
while (std::regex_search(str, matches, pat)) {
for (auto& token : matches) {
std::istringstream iss(bpe(token));
std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
std::istream_iterator<std::string>{}};
for (const auto& bpe_token : tokens) {
bpe_tokens.push_back(encoder[bpe_token]);
token_strs.push_back(bpe_token);
}
}
str = matches.suffix();
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
}
};
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)"); // BREAK keyword; defined but not yet handled below
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back(res.size());
} else if (text == "[") {
square_brackets.push_back(res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stod(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
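// C++ usage sketch mirroring the doctest comments above:
//   auto parts = parse_prompt_attention("an (important) word");
//   // parts == {{"an ", 1.0f}, {"important", 1.1f}, {" word", 1.0f}}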
/*================================================ FrozenCLIPEmbedder ================================================*/
struct ResidualAttentionBlock {
int32_t n_head;
int32_t d_model;
int32_t hidden_size; // n_head * d_model
int32_t intermediate_size;
// attention
struct ggml_tensor* q_w; // [hidden_size, hidden_size]
struct ggml_tensor* q_b; // [hidden_size, ]
struct ggml_tensor* k_w; // [hidden_size, hidden_size]
struct ggml_tensor* k_b; // [hidden_size, ]
struct ggml_tensor* v_w; // [hidden_size, hidden_size]
struct ggml_tensor* v_b; // [hidden_size, ]
struct ggml_tensor* out_w; // [hidden_size, hidden_size]
struct ggml_tensor* out_b; // [hidden_size, ]
// layer norm 1
struct ggml_tensor* ln1_w; // [hidden_size, ]
struct ggml_tensor* ln1_b; // [hidden_size, ]
// mlp
struct ggml_tensor* fc1_w; // [intermediate_size, hidden_size]
struct ggml_tensor* fc1_b; // [intermediate_size, ]
struct ggml_tensor* fc2_w; // [hidden_size, intermediate_size]
struct ggml_tensor* fc2_b; // [hidden_size, ]
// layer norm 2
struct ggml_tensor* ln2_w; // [hidden_size, ]
struct ggml_tensor* ln2_b; // [hidden_size, ]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype); // q_w/k_w/v_w/out_w
mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype); // fc1_w/fc2_w
mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32); // fc1_b
mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32); // fc2_b
mem_size += 16 * ggml_tensor_overhead(); // tensor overhead
return static_cast<size_t>(mem_size);
}
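// Worked example (SD1.x, fp16 weights, a rough estimate from the formula
// above): hidden_size = 768, intermediate_size = 3072 gives
//   4 * 768 * 768  * 2 B  ~ 4.7 MB   (q/k/v/out)
//   2 * 768 * 3072 * 2 B  ~ 9.4 MB   (fc1/fc2)
// plus a few KB of fp32 biases and norms, i.e. roughly 14 MB per block.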
void init_params(struct ggml_context* ctx, ggml_type wtype) {
ln1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
out_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
fc1_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, intermediate_size);
fc1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, intermediate_size);
fc2_w = ggml_new_tensor_2d(ctx, wtype, intermediate_size, hidden_size);
fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "self_attn.q_proj.weight"] = q_w;
tensors[prefix + "self_attn.q_proj.bias"] = q_b;
tensors[prefix + "self_attn.k_proj.weight"] = k_w;
tensors[prefix + "self_attn.k_proj.bias"] = k_b;
tensors[prefix + "self_attn.v_proj.weight"] = v_w;
tensors[prefix + "self_attn.v_proj.bias"] = v_b;
tensors[prefix + "self_attn.out_proj.weight"] = out_w;
tensors[prefix + "self_attn.out_proj.bias"] = out_b;
tensors[prefix + "layer_norm1.weight"] = ln1_w;
tensors[prefix + "layer_norm1.bias"] = ln1_b;
tensors[prefix + "layer_norm2.weight"] = ln2_w;
tensors[prefix + "layer_norm2.bias"] = ln2_b;
tensors[prefix + "mlp.fc1.weight"] = fc1_w;
tensors[prefix + "mlp.fc1.bias"] = fc1_b;
tensors[prefix + "mlp.fc2.weight"] = fc2_w;
tensors[prefix + "mlp.fc2.bias"] = fc2_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, hidden_size]
int64_t N = x->ne[2];
int64_t n_token = x->ne[1];
int64_t hidden_size = n_head * d_model;
struct ggml_tensor* r = x;
// layer norm 1
{
x = ggml_norm(ctx, x);
x = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ln1_w, x), x),
ggml_repeat(ctx, ln1_b, x));
}
// self-attention
{
struct ggml_tensor* q = ggml_add(ctx,
ggml_repeat(ctx, q_b, x),
ggml_mul_mat(ctx, q_w, x));
q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* k = ggml_add(ctx,
ggml_repeat(ctx, k_b, x),
ggml_mul_mat(ctx, k_w, x));
k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* v = ggml_add(ctx,
ggml_repeat(ctx, v_b, x),
ggml_mul_mat(ctx, v_w, x));
v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_model, n_token]
v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N); // [N * n_head, d_model, n_token]
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, n_token, n_token]
kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, n_token, d_model]
kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_model]
x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N); // [N * n_token, d_model * n_head]
}
// attention output
x = ggml_add(ctx, ggml_repeat(ctx, out_b, x), ggml_mul_mat(ctx, out_w, x));
// residual
x = ggml_add(ctx, x, r);
r = x;
// layer norm 2
{
x = ggml_norm(ctx, x);
x = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ln2_w, x), x),
ggml_repeat(ctx, ln2_b, x));
}
// mlp
x = ggml_mul_mat(ctx, fc1_w, x);
x = ggml_add(ctx, ggml_repeat(ctx, fc1_b, x), x);
if (hidden_size == 1024) { // SD 2.x
x = ggml_gelu_inplace(ctx, x);
} else { // SD 1.x
x = ggml_gelu_quick_inplace(ctx, x);
}
x = ggml_mul_mat(ctx, fc2_w, x);
x = ggml_add(ctx, ggml_repeat(ctx, fc2_b, x), x);
// residual 2
x = ggml_add(ctx, x, r);
return x;
}
};
// SD1.x: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
// SD2.x: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
struct CLIPTextModel {
ModelType model_type = SD1;
// network hparams
int32_t vocab_size = 49408;
int32_t max_position_embeddings = 77;
int32_t hidden_size = 768; // 1024 for SD 2.x
int32_t intermediate_size = 3072; // 4096 for SD 2.x
int32_t n_head = 12; // num_attention_heads, 16 for SD 2.x
int32_t num_hidden_layers = 12; // 24 for SD 2.x
// embeddings
struct ggml_tensor* position_ids;
struct ggml_tensor* token_embed_weight;
struct ggml_tensor* position_embed_weight;
// transformer
std::vector<ResidualAttentionBlock> resblocks;
struct ggml_tensor* final_ln_w;
struct ggml_tensor* final_ln_b;
CLIPTextModel(ModelType model_type = SD1)
: model_type(model_type) {
if (model_type == SD2) {
hidden_size = 1024;
intermediate_size = 4096;
n_head = 16;
num_hidden_layers = 24;
}
resblocks.resize(num_hidden_layers);
set_resblocks_hp_params();
}
void set_resblocks_hp_params() {
int d_model = hidden_size / n_head; // 64
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].d_model = d_model;
resblocks[i].n_head = n_head;
resblocks[i].hidden_size = hidden_size;
resblocks[i].intermediate_size = intermediate_size;
}
}
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(GGML_TYPE_I32); // position_ids
mem_size += hidden_size * vocab_size * ggml_type_sizef(wtype); // token_embed_weight
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(wtype); // position_embed_weight
for (int i = 0; i < num_hidden_layers; i++) {
mem_size += resblocks[i].compute_params_mem_size(wtype);
}
mem_size += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // final_ln_w/b
mem_size += ggml_tensor_overhead(); // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, max_position_embeddings);
for (int i = 0; i < max_position_embeddings; i++) {
ggml_set_i32_1d(position_ids, i, i);
}
token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].init_params(ctx, wtype);
}
final_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight;
tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
tensors[prefix + "final_layer_norm.weight"] = final_ln_w;
tensors[prefix + "final_layer_norm.bias"] = final_ln_b;
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].map_by_name(tensors, prefix + "encoder.layers." + std::to_string(i) + ".");
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids) {
// input_ids: [N, n_token]
GGML_ASSERT(input_ids->ne[0] <= position_ids->ne[0]);
// token_embedding + position_embedding
struct ggml_tensor* x;
x = ggml_add(ctx,
ggml_get_rows(ctx, token_embed_weight, input_ids),
ggml_get_rows(ctx,
position_embed_weight,
ggml_view_1d(ctx, position_ids, input_ids->ne[0], 0))); // [N, n_token, hidden_size]
// transformer
for (int i = 0; i < num_hidden_layers; i++) {
if (model_type == SD2 && i == num_hidden_layers - 1) { // layer: "penultimate"
break;
}
x = resblocks[i].forward(ctx, x); // [N, n_token, hidden_size]
}
// final layer norm
{
x = ggml_norm(ctx, x);
x = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, final_ln_w, x), x),
ggml_repeat(ctx, final_ln_b, x));
}
return x; // [N, n_token, hidden_size]
}
};
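// Note on the "penultimate" break in forward() above: SD2.x conditions on
// the output of the second-to-last transformer layer (CLIP skip = 2), so
// the final resblock is allocated but deliberately never evaluated.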
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
struct FrozenCLIPEmbedder {
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
struct ggml_tensor* forward(struct ggml_context* ctx, const std::string& prompt) {
std::vector<int32_t> tokens = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
struct ggml_tensor* hidden_states = text_model.forward(ctx, input_ids);
return hidden_states;
}
};
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords {
ModelType model_type = SD1;
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
FrozenCLIPEmbedderWithCustomWords(ModelType model_type = SD1)
: model_type(model_type), tokenizer(model_type), text_model(model_type) {}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.encode(curr_text);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
weights.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
} else {
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (model_type == SD2) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
weights.insert(weights.end(), max_length - weights.size(), 1.0);
}
}
}
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
};
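// Example (hand-derived sketch, assuming the CLIP vocabulary is loaded):
// tokenize("an (important) word", 77, true) pairs each id with its weight:
//   tokens : [BOS, id("an</w>"), id("important</w>"), id("word</w>"), EOS, pad, ...]
//   weights: [1.0,     1.0,            1.1,                1.0,       1.0, 1.0, ...]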
/*==================================================== UnetModel =====================================================*/
struct ResBlock {
// network hparams
int channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int emb_channels; // time_embed_dim
int out_channels; // mult * model_channels
// network params
// in_layers
struct ggml_tensor* in_layer_0_w; // [channels, ]
struct ggml_tensor* in_layer_0_b; // [channels, ]
// in_layer_1 is nn.SILU()
struct ggml_tensor* in_layer_2_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* in_layer_2_b; // [out_channels, ]
// emb_layers
// emb_layer_0 is nn.SILU()
struct ggml_tensor* emb_layer_1_w; // [out_channels, emb_channels]
struct ggml_tensor* emb_layer_1_b; // [out_channels, ]
// out_layers
struct ggml_tensor* out_layer_0_w; // [out_channels, ]
struct ggml_tensor* out_layer_0_b; // [out_channels, ]
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), p = 0 for inference
struct ggml_tensor* out_layer_3_w; // [out_channels, out_channels, 3, 3]
struct ggml_tensor* out_layer_3_b; // [out_channels, ]
// skip connection, only if out_channels != channels
struct ggml_tensor* skip_w; // [out_channels, channels, 1, 1]
struct ggml_tensor* skip_b; // [out_channels, ]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_0_w/b
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // in_layer_2_w
mem_size += 5 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_2_b/emb_layer_1_b/out_layer_0_w/out_layer_0_b/out_layer_3_b
mem_size += out_channels * emb_channels * ggml_type_sizef(wtype); // emb_layer_1_w
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_layer_3_w
mem_size += 10 * ggml_tensor_overhead(); // object overhead
if (out_channels != channels) {
mem_size += out_channels * channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // skip_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // skip_b
mem_size += 2 * ggml_tensor_overhead(); // object overhead
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
in_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
in_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
in_layer_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
in_layer_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
emb_layer_1_w = ggml_new_tensor_2d(ctx, wtype, emb_channels, out_channels);
emb_layer_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
out_layer_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (out_channels != channels) {
skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, channels, out_channels);
skip_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "in_layers.0.weight"] = in_layer_0_w;
tensors[prefix + "in_layers.0.bias"] = in_layer_0_b;
tensors[prefix + "in_layers.2.weight"] = in_layer_2_w;
tensors[prefix + "in_layers.2.bias"] = in_layer_2_b;
tensors[prefix + "emb_layers.1.weight"] = emb_layer_1_w;
tensors[prefix + "emb_layers.1.bias"] = emb_layer_1_b;
tensors[prefix + "out_layers.0.weight"] = out_layer_0_w;
tensors[prefix + "out_layers.0.bias"] = out_layer_0_b;
tensors[prefix + "out_layers.3.weight"] = out_layer_3_w;
tensors[prefix + "out_layers.3.bias"] = out_layer_3_b;
if (out_channels != channels) {
tensors[prefix + "skip_connection.weight"] = skip_w;
tensors[prefix + "skip_connection.bias"] = skip_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb) {
// x: [N, channels, h, w]
// emb: [N, emb_channels]
// in_layers
// group norm 32
auto h = ggml_group_norm_32(ctx, x);
h = ggml_add(ctx,
ggml_mul(ctx,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1),
h),
h),
ggml_repeat(ctx,
ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1),
h));
// silu
h = ggml_silu_inplace(ctx, h);
// conv2d
h = ggml_conv_2d(ctx, in_layer_2_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, in_layer_2_b, 1, 1, in_layer_2_b->ne[0], 1),
h)); // [N, out_channels, h, w]
// emb_layers
auto emb_out = ggml_silu(ctx, emb);
emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out);
emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels]
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w]
// out_layers
h = ggml_add(ctx, h, emb_out);
// group norm 32
h = ggml_group_norm_inplace(ctx, h, 32);
h = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_w, 1, 1, out_layer_0_w->ne[0], 1), h), h),
ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_b, 1, 1, out_layer_0_b->ne[0], 1), h));
// silu
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
// conv2d
h = ggml_conv_2d(ctx, out_layer_3_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, out_layer_3_b, 1, 1, out_layer_3_b->ne[0], 1),
h)); // [N, out_channels, h, w]
// skip connection
if (out_channels != channels) {
x = ggml_conv_2d(ctx, skip_w, x, 1, 1, 0, 0, 1, 1);
x = ggml_add(ctx,
x,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, skip_b, 1, 1, skip_b->ne[0], 1),
x)); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w]
}
};
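// In PyTorch-like pseudocode the block above computes (a restatement, not
// extra functionality):
//   h   = conv3x3(silu(group_norm(x)))
//   h  += linear(silu(emb))[:, :, None, None]
//   out = skip(x) + conv3x3(silu(group_norm(h)))
// where skip is a 1x1 conv when out_channels != channels, else identity.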
struct SpatialTransformer {
int in_channels; // mult * model_channels
int n_head; // num_heads
int d_head; // in_channels / n_head
int depth = 1; // 1
int context_dim = 768; // hidden_size, 1024 for SD2.x
// group norm
struct ggml_tensor* norm_w; // [in_channels,]
struct ggml_tensor* norm_b; // [in_channels,]
// proj_in
struct ggml_tensor* proj_in_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_in_b; // [in_channels,]
// transformer
struct
{
// layer norm 1
struct ggml_tensor* norm1_w; // [in_channels, ]
struct ggml_tensor* norm1_b; // [in_channels, ]
// attn1
struct ggml_tensor* attn1_q_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_k_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_v_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_out_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_out_b; // [in_channels, ]
// layer norm 2
struct ggml_tensor* norm2_w; // [in_channels, ]
struct ggml_tensor* norm2_b; // [in_channels, ]
// attn2
struct ggml_tensor* attn2_q_w; // [in_channels, in_channels]
struct ggml_tensor* attn2_k_w; // [in_channels, context_dim]
struct ggml_tensor* attn2_v_w; // [in_channels, context_dim]
struct ggml_tensor* attn2_out_w; // [in_channels, in_channels]
struct ggml_tensor* attn2_out_b; // [in_channels, ]
// layer norm 3
struct ggml_tensor* norm3_w; // [in_channels, ]
struct ggml_tensor* norm3_b; // [in_channels, ]
// ff
struct ggml_tensor* ff_0_proj_w; // [in_channels * 4 * 2, in_channels]
struct ggml_tensor* ff_0_proj_b; // [in_channels * 4 * 2]
struct ggml_tensor* ff_2_w; // [in_channels, in_channels * 4]
struct ggml_tensor* ff_2_b; // [in_channels,]
} transformer;
// proj_out
struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_out_b; // [in_channels,]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b
mem_size += 2 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // proj_in_w/proj_out_w
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // proj_in_b/proj_out_b
// transformer
{
mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1-3_w/b
mem_size += 6 * in_channels * in_channels * ggml_type_sizef(wtype); // attn1_q/k/v/out_w attn2_q/out_w
mem_size += 2 * in_channels * context_dim * ggml_type_sizef(wtype); // attn2_k/v_w
mem_size += in_channels * 4 * 2 * in_channels * ggml_type_sizef(wtype); // ff_0_proj_w
mem_size += in_channels * 4 * 2 * ggml_type_sizef(GGML_TYPE_F32); // ff_0_proj_b
mem_size += in_channels * 4 * in_channels * ggml_type_sizef(wtype); // ff_2_w
mem_size += in_channels * ggml_type_sizef(GGML_TYPE_F32); // ff_2_b
}
mem_size += 26 * ggml_tensor_overhead(); // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
// transformer
transformer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.attn1_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_k_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_v_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_out_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.attn2_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn2_k_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels);
transformer.attn2_v_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels);
transformer.attn2_out_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn2_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.ff_0_proj_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels * 4 * 2);
transformer.ff_0_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels * 4 * 2);
transformer.ff_2_w = ggml_new_tensor_2d(ctx, wtype, in_channels * 4, in_channels);
transformer.ff_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm.weight"] = norm_w;
tensors[prefix + "norm.bias"] = norm_b;
tensors[prefix + "proj_in.weight"] = proj_in_w;
tensors[prefix + "proj_in.bias"] = proj_in_b;
// transformer
{
std::string transformer_prefix = prefix + "transformer_blocks.0.";
tensors[transformer_prefix + "attn1.to_q.weight"] = transformer.attn1_q_w;
tensors[transformer_prefix + "attn1.to_k.weight"] = transformer.attn1_k_w;
tensors[transformer_prefix + "attn1.to_v.weight"] = transformer.attn1_v_w;
tensors[transformer_prefix + "attn1.to_out.0.weight"] = transformer.attn1_out_w;
tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b;
tensors[transformer_prefix + "ff.net.0.proj.weight"] = transformer.ff_0_proj_w;
tensors[transformer_prefix + "ff.net.0.proj.bias"] = transformer.ff_0_proj_b;
tensors[transformer_prefix + "ff.net.2.weight"] = transformer.ff_2_w;
tensors[transformer_prefix + "ff.net.2.bias"] = transformer.ff_2_b;
tensors[transformer_prefix + "attn2.to_q.weight"] = transformer.attn2_q_w;
tensors[transformer_prefix + "attn2.to_k.weight"] = transformer.attn2_k_w;
tensors[transformer_prefix + "attn2.to_v.weight"] = transformer.attn2_v_w;
tensors[transformer_prefix + "attn2.to_out.0.weight"] = transformer.attn2_out_w;
tensors[transformer_prefix + "attn2.to_out.0.bias"] = transformer.attn2_out_b;
tensors[transformer_prefix + "norm1.weight"] = transformer.norm1_w;
tensors[transformer_prefix + "norm1.bias"] = transformer.norm1_b;
tensors[transformer_prefix + "norm2.weight"] = transformer.norm2_w;
tensors[transformer_prefix + "norm2.bias"] = transformer.norm2_b;
tensors[transformer_prefix + "norm3.weight"] = transformer.norm3_w;
tensors[transformer_prefix + "norm3.bias"] = transformer.norm3_b;
}
tensors[prefix + "proj_out.weight"] = proj_out_w;
tensors[prefix + "proj_out.bias"] = proj_out_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position, hidden_size(aka context_dim)]
auto x_in = x;
// group norm 32
x = ggml_group_norm_32(ctx, x);
x = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), x), x),
ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), x));
// proj_in
x = ggml_conv_2d(ctx, proj_in_w, x, 1, 1, 0, 0, 1, 1);
x = ggml_add(ctx,
x,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, proj_in_b, 1, 1, proj_in_b->ne[0], 1),
x)); // [N, in_channels, h, w]
// transformer
const int64_t n = x->ne[3];
const int64_t c = x->ne[2];
const int64_t h = x->ne[1];
const int64_t w = x->ne[0];
const int64_t max_position = context->ne[1];
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, in_channels]
{
auto r = x;
// layer norm 1
{
x = ggml_reshape_2d(ctx, x, c, w * h * n);
x = ggml_norm(ctx, x);
x = ggml_add(ctx,
ggml_mul(ctx,
ggml_repeat(ctx, transformer.norm1_w, x),
x),
ggml_repeat(ctx, transformer.norm1_b, x));
}
// self-attention
{
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x); // [N * h * w, in_channels]
q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x); // [N * h * w, in_channels]
k = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
k = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x); // [N * h * w, in_channels]
v = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, h * w]
v = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n); // [N * n_head, d_head, h * w]
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, h * w]
// kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, h * w, n_head, d_head]
// x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n));
x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n);
x = ggml_add(ctx, ggml_repeat(ctx, transformer.attn1_out_b, x), ggml_mul_mat(ctx, transformer.attn1_out_w, x));
x = ggml_reshape_4d(ctx, x, c, w, h, n);
}
x = ggml_add(ctx, x, r);
r = x;
// layer norm 2
{
x = ggml_norm(ctx, x);
x = ggml_add(ctx,
ggml_mul(ctx,
ggml_repeat(ctx, transformer.norm2_w, x), x),
ggml_repeat(ctx, transformer.norm2_b, x));
}
// cross-attention
{
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
context = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]); // [N * max_position, hidden_size]
struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x); // [N * h * w, in_channels]
q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context); // [N * max_position, in_channels]
k = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, max_position, d_head]
k = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n); // [N * n_head, max_position, d_head]
struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context); // [N * max_position, in_channels]
v = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, max_position]
v = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n); // [N * n_head, d_head, max_position]
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position]
// kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));
// x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n)); // [N * h * w, in_channels]
x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n); // [N * h * w, in_channels]
x = ggml_add(ctx, ggml_repeat(ctx, transformer.attn2_out_b, x), ggml_mul_mat(ctx, transformer.attn2_out_w, x));
x = ggml_reshape_4d(ctx, x, c, w, h, n);
}
x = ggml_add(ctx, x, r);
r = x;
// layer norm 3
{
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
x = ggml_norm(ctx, x);
x = ggml_add(ctx,
ggml_mul(ctx,
ggml_repeat(ctx, transformer.norm3_w, x), x),
ggml_repeat(ctx, transformer.norm3_b, x));
}
// ff
{
// GEGLU
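// GEGLU(x) = (x @ W + b) * gelu(x @ V + c), where W/b and V/c are the two
// halves of ff_0_proj_w / ff_0_proj_b carved out by the views below
// (a restatement of the code that follows).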
auto x_w = ggml_view_2d(ctx,
transformer.ff_0_proj_w,
transformer.ff_0_proj_w->ne[0],
transformer.ff_0_proj_w->ne[1] / 2,
transformer.ff_0_proj_w->nb[1],
0); // [in_channels * 4, in_channels]
auto x_b = ggml_view_1d(ctx,
transformer.ff_0_proj_b,
transformer.ff_0_proj_b->ne[0] / 2,
0); // [in_channels * 4, ]
auto gate_w = ggml_view_2d(ctx,
transformer.ff_0_proj_w,
transformer.ff_0_proj_w->ne[0],
transformer.ff_0_proj_w->ne[1] / 2,
transformer.ff_0_proj_w->nb[1],
transformer.ff_0_proj_w->nb[1] * transformer.ff_0_proj_w->ne[1] / 2); // [in_channels * 4, in_channels]
auto gate_b = ggml_view_1d(ctx,
transformer.ff_0_proj_b,
transformer.ff_0_proj_b->ne[0] / 2,
transformer.ff_0_proj_b->nb[0] * transformer.ff_0_proj_b->ne[0] / 2); // [in_channels * 4, ]
x = ggml_reshape_2d(ctx, x, c, w * h * n);
auto ff_in = x; // renamed to avoid shadowing the outer x_in used for the final residual
x = ggml_mul_mat(ctx, x_w, ff_in); // [N * h * w, in_channels * 4]
x = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x);
auto gate = ggml_mul_mat(ctx, gate_w, ff_in); // [N * h * w, in_channels * 4]
gate = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate);
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx, x, gate); // [N * h * w, in_channels * 4]
// fc
x = ggml_mul_mat(ctx, transformer.ff_2_w, x); // [N * h * w, in_channels]
x = ggml_add(ctx, ggml_repeat(ctx, transformer.ff_2_b, x), x);
}
x = ggml_reshape_4d(ctx, x, c, w, h, n); // [N, h, w, in_channels]
// residual
x = ggml_add(ctx, x, r);
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, in_channels, h, w]
// proj_out
x = ggml_conv_2d(ctx, proj_out_w, x, 1, 1, 0, 0, 1, 1);
x = ggml_add(ctx,
x,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, proj_out_b, 1, 1, proj_out_b->ne[0], 1),
x)); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
struct DownSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* op_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* op_b; // [out_channels,]
bool vae_downsample = false;
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
mem_size += 2 * ggml_tensor_overhead(); // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (vae_downsample) {
tensors[prefix + "conv.weight"] = op_w;
tensors[prefix + "conv.bias"] = op_b;
} else {
tensors[prefix + "op.weight"] = op_w;
tensors[prefix + "op.bias"] = op_b;
}
}
// TODO: make it parallel (split rows across ith/nth)
static void asymmetric_pad(struct ggml_tensor* dst,
const struct ggml_tensor* a,
const struct ggml_tensor* b,
int ith,
int nth,
void* userdata) {
assert(dst->nb[0] == sizeof(float));
assert(a->nb[0] == sizeof(float));
assert(b->nb[0] == sizeof(float));
float value = 0;
for (int i = 0; i < dst->ne[3]; i++) {
for (int j = 0; j < dst->ne[2]; j++) {
for (int k = 0; k < dst->ne[1]; k++) {
for (int l = 0; l < dst->ne[0]; l++) {
if (k == dst->ne[1] - 1 || l == dst->ne[0] - 1) {
value = 0;
} else {
value = ggml_tensor_get_f32(b, l, k, j, i);
}
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_tensor_set_f32(dst, value, l, k, j, i);
}
}
}
}
}
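// Shape sketch (illustrative): for a 64x64 VAE feature map, pad_x below is
// 65x65 with the extra right column and bottom row zeroed here, and the
// stride-2 3x3 conv then yields floor((65 - 3) / 2) + 1 = 32, matching
// PyTorch's asymmetric pad = (0, 1, 0, 1).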
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
bool dynamic = ggml_get_dynamic(ctx);
ggml_set_dynamic(ctx, false);
auto pad_x = ggml_new_tensor_4d(ctx, x->type, x->ne[0] + 1, x->ne[1] + 1, x->ne[2], x->ne[3]);
ggml_set_dynamic(ctx, dynamic);
x = ggml_map_custom2_inplace(ctx, pad_x, x, asymmetric_pad, 1, NULL);
x = ggml_conv_2d(ctx, op_w, x, 2, 2, 0, 0, 1, 1);
} else {
x = ggml_conv_2d(ctx, op_w, x, 2, 2, 1, 1, 1, 1);
}
x = ggml_add(ctx,
x,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, op_b, 1, 1, op_b->ne[0], 1),
x)); // [N, out_channels, h/2, w/2]
return x;
}
};
struct UpSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* conv_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* conv_b; // [out_channels,]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
mem_size += 2 * ggml_tensor_overhead(); // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "conv.weight"] = conv_w;
tensors[prefix + "conv.bias"] = conv_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
x = ggml_conv_2d(ctx, conv_w, x, 1, 1, 1, 1, 1, 1);
x = ggml_add(ctx,
x,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv_b, 1, 1, conv_b->ne[0], 1),
x)); // [N, out_channels, h*2, w*2]
return x;
}
};
// ldm.modules.diffusionmodules.openaimodel.UNetModel
struct UNetModel {
// network hparams
int in_channels = 4;
int model_channels = 320;
int out_channels = 4;
int num_res_blocks = 2;
int attention_resolutions[3] = {4, 2, 1};
int channel_mult[4] = {1, 2, 4, 4};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels / num_heads; -1 means derive d_head from num_heads instead
int context_dim = 768; // 1024 for SD2.x
// network params
struct ggml_tensor* time_embed_0_w; // [time_embed_dim, model_channels]
struct ggml_tensor* time_embed_0_b; // [time_embed_dim, ]
// time_embed_1 is nn.SILU()
struct ggml_tensor* time_embed_2_w; // [time_embed_dim, time_embed_dim]
struct ggml_tensor* time_embed_2_b; // [time_embed_dim, ]
struct ggml_tensor* input_block_0_w; // [model_channels, in_channels, 3, 3]
struct ggml_tensor* input_block_0_b; // [model_channels, ]
// input_blocks
ResBlock input_res_blocks[4][2];
SpatialTransformer input_transformers[3][2];
DownSample input_down_samples[3];
// middle_block
ResBlock middle_block_0;
SpatialTransformer middle_block_1;
ResBlock middle_block_2;
// output_blocks
ResBlock output_res_blocks[4][3];
SpatialTransformer output_transformers[3][3];
UpSample output_up_samples[3];
// out
// group norm 32
struct ggml_tensor* out_0_w; // [model_channels, ]
struct ggml_tensor* out_0_b; // [model_channels, ]
// out 1 is nn.SILU()
struct ggml_tensor* out_2_w; // [out_channels, model_channels, 3, 3]
struct ggml_tensor* out_2_b; // [out_channels, ]
UNetModel(ModelType model_type = SD1) {
if (model_type == SD2) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
}
// set up hparams of blocks
// input_blocks
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int ds = 1;
int len_mults = sizeof(channel_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_res_blocks[i][j].channels = ch;
input_res_blocks[i][j].emb_channels = time_embed_dim;
input_res_blocks[i][j].out_channels = mult * model_channels;
ch = mult * model_channels;
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
input_transformers[i][j].in_channels = ch;
input_transformers[i][j].n_head = n_head;
input_transformers[i][j].d_head = d_head;
input_transformers[i][j].context_dim = context_dim;
}
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_down_samples[i].channels = ch;
input_down_samples[i].out_channels = ch;
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
middle_block_0.channels = ch;
middle_block_0.emb_channels = time_embed_dim;
middle_block_0.out_channels = ch;
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
middle_block_1.in_channels = ch;
middle_block_1.n_head = n_head;
middle_block_1.d_head = d_head;
middle_block_1.context_dim = context_dim;
middle_block_2.channels = ch;
middle_block_2.emb_channels = time_embed_dim;
middle_block_2.out_channels = ch;
// output blocks
for (int i = len_mults - 1; i >= 0; i--) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks + 1; j++) {
int ich = input_block_chans.back();
input_block_chans.pop_back();
output_res_blocks[i][j].channels = ch + ich;
output_res_blocks[i][j].emb_channels = time_embed_dim;
output_res_blocks[i][j].out_channels = mult * model_channels;
ch = mult * model_channels;
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
output_transformers[i][j].in_channels = ch;
output_transformers[i][j].n_head = n_head;
output_transformers[i][j].d_head = d_head;
output_transformers[i][j].context_dim = context_dim;
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].channels = ch;
output_up_samples[i - 1].out_channels = ch;
ds /= 2;
}
}
}
}
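// Resulting SD1.x schedule (a hand-trace of the constructor above):
//   level : mult   ch    ds   attention?
//     0   :  1     320    1   yes
//     1   :  2     640    2   yes
//     2   :  4    1280    4   yes
//     3   :  4    1280    8   no (8 is not in attention_resolutions)
// input_block_chans ends up as
//   {320, 320, 320, 320, 640, 640, 640, 1280, 1280, 1280, 1280, 1280}.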
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += time_embed_dim * model_channels * ggml_type_sizef(wtype); // time_embed_0_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_0_b
mem_size += time_embed_dim * time_embed_dim * ggml_type_sizef(wtype); // time_embed_2_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_2_b
mem_size += model_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // input_block_0_w
mem_size += model_channels * ggml_type_sizef(GGML_TYPE_F32); // input_block_0_b
mem_size += 6 * ggml_tensor_overhead(); // object overhead
// input_blocks
int ds = 1;
int len_mults = sizeof(channel_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
mem_size += input_res_blocks[i][j].compute_params_mem_size(wtype);
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
mem_size += input_transformers[i][j].compute_params_mem_size(wtype);
}
}
if (i != len_mults - 1) {
ds *= 2;
mem_size += input_down_samples[i].compute_params_mem_size(wtype);
}
}
// middle_block
mem_size += middle_block_0.compute_params_mem_size(wtype);
mem_size += middle_block_1.compute_params_mem_size(wtype);
mem_size += middle_block_2.compute_params_mem_size(wtype);
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += output_res_blocks[i][j].compute_params_mem_size(wtype);
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
mem_size += output_transformers[i][j].compute_params_mem_size(wtype);
}
if (i > 0 && j == num_res_blocks) {
mem_size += output_up_samples[i - 1].compute_params_mem_size(wtype);
ds /= 2;
}
}
}
// out
mem_size += 2 * model_channels * ggml_type_sizef(GGML_TYPE_F32); // out_0_w/b
mem_size += out_channels * model_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_2_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // out_2_b
mem_size += 4 * ggml_tensor_overhead();
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
time_embed_0_w = ggml_new_tensor_2d(ctx, wtype, model_channels, time_embed_dim);
time_embed_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
time_embed_2_w = ggml_new_tensor_2d(ctx, wtype, time_embed_dim, time_embed_dim);
time_embed_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
// input_blocks
input_block_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, model_channels);
input_block_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
int ds = 1;
int len_mults = sizeof(channel_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
input_res_blocks[i][j].init_params(ctx, wtype);
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
input_transformers[i][j].init_params(ctx, wtype);
}
}
if (i != len_mults - 1) {
input_down_samples[i].init_params(ctx, wtype);
ds *= 2;
}
}
// middle_blocks
middle_block_0.init_params(ctx, wtype);
middle_block_1.init_params(ctx, wtype);
middle_block_2.init_params(ctx, wtype);
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
output_res_blocks[i][j].init_params(ctx, wtype);
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
output_transformers[i][j].init_params(ctx, wtype);
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].init_params(ctx, wtype);
ds /= 2;
}
}
}
// out
out_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
out_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
out_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, model_channels, out_channels);
out_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "time_embed.0.weight"] = time_embed_0_w;
tensors[prefix + "time_embed.0.bias"] = time_embed_0_b;
tensors[prefix + "time_embed.2.weight"] = time_embed_2_w;
tensors[prefix + "time_embed.2.bias"] = time_embed_2_b;
// input_blocks
tensors[prefix + "input_blocks.0.0.weight"] = input_block_0_w;
tensors[prefix + "input_blocks.0.0.bias"] = input_block_0_b;
int len_mults = sizeof(channel_mult) / sizeof(int);
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
input_res_blocks[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0.");
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
input_transformers[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".1.");
}
}
if (i != len_mults - 1) {
input_block_idx += 1;
input_down_samples[i].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0.");
ds *= 2;
}
}
// middle_blocks
middle_block_0.map_by_name(tensors, prefix + "middle_block.0.");
middle_block_1.map_by_name(tensors, prefix + "middle_block.1.");
middle_block_2.map_by_name(tensors, prefix + "middle_block.2.");
// output_blocks
int output_block_idx = 0;
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
output_res_blocks[i][j].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + ".0.");
int up_sample_idx = 1;
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
output_transformers[i][j].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + ".1.");
up_sample_idx++;
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx) + ".");
ds /= 2;
}
output_block_idx += 1;
}
}
// out
tensors[prefix + "out.0.weight"] = out_0_w;
tensors[prefix + "out.0.bias"] = out_0_b;
tensors[prefix + "out.2.weight"] = out_2_w;
tensors[prefix + "out.2.bias"] = out_2_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* t_emb = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// t_emb: [N, model_channels]
// context: [N, max_position, hidden_size]([N, 77, 768])
if (t_emb == NULL && timesteps != NULL) {
t_emb = new_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
}
// time_embed
auto emb = ggml_mul_mat(ctx, time_embed_0_w, t_emb);
emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb);
emb = ggml_silu_inplace(ctx, emb);
emb = ggml_mul_mat(ctx, time_embed_2_w, emb);
emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb); // [N, time_embed_dim]
// input_blocks
std::vector<struct ggml_tensor*> hs;
// input block 0
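// Note: in ggml_conv_2d the arguments after kernel and input are stride
// (s0, s1), padding (p0, p1) and dilation (d0, d1); the 3x3 convolutions
// here use stride 1, padding 1, dilation 1, preserving spatial size.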
auto h = ggml_conv_2d(ctx, input_block_0_w, x, 1, 1, 1, 1, 1, 1); // [N, model_channels, h, w]
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1),
h)); // [N, model_channels, h, w]
hs.push_back(h);
// input block 1-11
int len_mults = sizeof(channel_mult) / sizeof(int);
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
h = input_res_blocks[i][j].forward(ctx, h, emb); // [N, mult*model_channels, h, w]
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
h = input_transformers[i][j].forward(ctx, h, context); // [N, mult*model_channels, h, w]
}
hs.push_back(h);
}
if (i != len_mults - 1) {
ds *= 2;
h = input_down_samples[i].forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
hs.push_back(h);
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = middle_block_0.forward(ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = middle_block_1.forward(ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = middle_block_2.forward(ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
auto h_skip = hs.back();
hs.pop_back();
h = ggml_concat(ctx, h, h_skip);
h = output_res_blocks[i][j].forward(ctx, h, emb);
if (ds == attention_resolutions[0] || ds == attention_resolutions[1] || ds == attention_resolutions[2]) {
h = output_transformers[i][j].forward(ctx, h, context);
}
if (i > 0 && j == num_res_blocks) {
h = output_up_samples[i - 1].forward(ctx, h);
ds /= 2;
}
}
}
// out
// group norm 32
h = ggml_group_norm_32(ctx, h);
h = ggml_add(ctx,
ggml_mul(ctx,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, out_0_w, 1, 1, out_0_w->ne[0], 1),
h),
h),
ggml_repeat(ctx,
ggml_reshape_4d(ctx, out_0_b, 1, 1, out_0_b->ne[0], 1),
h));
// silu
h = ggml_silu_inplace(ctx, h);
// conv2d
h = ggml_conv_2d(ctx, out_2_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, out_2_b, 1, 1, out_2_b->ne[0], 1),
h)); // [N, out_channels, h, w]
return h;
}
};
/*================================================== AutoEncoderKL ===================================================*/
struct ResnetBlock {
// network hparams
int in_channels;
int out_channels;
// network params
struct ggml_tensor* norm1_w; // [in_channels, ]
struct ggml_tensor* norm1_b; // [in_channels, ]
struct ggml_tensor* conv1_w; // [out_channels, in_channels, 3, 3]
struct ggml_tensor* conv1_b; // [out_channels, ]
struct ggml_tensor* norm2_w; // [out_channels, ]
struct ggml_tensor* norm2_b; // [out_channels, ]
struct ggml_tensor* conv2_w; // [out_channels, out_channels, 3, 3]
struct ggml_tensor* conv2_b; // [out_channels, ]
// nin_shortcut, only if out_channels != in_channels
struct ggml_tensor* nin_shortcut_w; // [out_channels, in_channels, 1, 1]
struct ggml_tensor* nin_shortcut_b; // [out_channels, ]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1_w/b
mem_size += out_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv1_w
mem_size += 4 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // conv1_b/norm2_w/norm2_b/conv2_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv2_w
mem_size += 8 * ggml_tensor_overhead(); // object overhead
if (out_channels != in_channels) {
mem_size += out_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // nin_shortcut_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // nin_shortcut_b
mem_size += 2 * ggml_tensor_overhead(); // object overhead
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (out_channels != in_channels) {
nin_shortcut_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, out_channels);
nin_shortcut_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm1.weight"] = norm1_w;
tensors[prefix + "norm1.bias"] = norm1_b;
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
tensors[prefix + "norm2.weight"] = norm2_w;
tensors[prefix + "norm2.bias"] = norm2_b;
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
if (out_channels != in_channels) {
tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w;
tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, in_channels, h, w]
// group norm 32
auto h = ggml_group_norm_32(ctx, z);
h = ggml_mul(ctx,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1),
h),
h);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1),
h));
// silu
h = ggml_silu_inplace(ctx, h);
// conv2d
h = ggml_conv_2d(ctx, conv1_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv1_b, 1, 1, conv1_b->ne[0], 1),
h)); // [N, out_channels, h, w]
// group norm 32
h = ggml_group_norm_32(ctx, h);
h = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_w, 1, 1, norm2_w->ne[0], 1), h), h),
ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_b, 1, 1, norm2_b->ne[0], 1), h));
// silu
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
// conv2d
h = ggml_conv_2d(ctx, conv2_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv2_b, 1, 1, conv2_b->ne[0], 1),
h)); // [N, out_channels, h, w]
// skip connection
if (out_channels != in_channels) {
z = ggml_conv_2d(ctx, nin_shortcut_w, z, 1, 1, 0, 0, 1, 1);
z = ggml_add(ctx,
z,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, nin_shortcut_b, 1, 1, nin_shortcut_b->ne[0], 1),
z)); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, z);
return h; // [N, out_channels, h, w]
}
};
struct AttnBlock {
int in_channels; // mult * model_channels
// group norm
struct ggml_tensor* norm_w; // [in_channels,]
struct ggml_tensor* norm_b; // [in_channels,]
// q/k/v
struct ggml_tensor* q_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* q_b; // [in_channels,]
struct ggml_tensor* k_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* k_b; // [in_channels,]
struct ggml_tensor* v_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* v_b; // [in_channels,]
// proj_out
struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_out_b; // [in_channels,]
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b/q_b/k_b/v_b/proj_out_b
mem_size += 4 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // q_w/k_w/v_w/proj_out_w
mem_size += 10 * ggml_tensor_overhead(); // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm.weight"] = norm_w;
tensors[prefix + "norm.bias"] = norm_b;
tensors[prefix + "q.weight"] = q_w;
tensors[prefix + "q.bias"] = q_b;
tensors[prefix + "k.weight"] = k_w;
tensors[prefix + "k.bias"] = k_b;
tensors[prefix + "v.weight"] = v_w;
tensors[prefix + "v.bias"] = v_b;
tensors[prefix + "proj_out.weight"] = proj_out_w;
tensors[prefix + "proj_out.bias"] = proj_out_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
// group norm 32
auto h_ = ggml_group_norm_32(ctx, x);
h_ = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_),
ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_));
const int64_t n = h_->ne[3];
const int64_t c = h_->ne[2];
const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0];
// q
auto q = ggml_conv_2d(ctx, q_w, h_, 1, 1, 0, 0, 1, 1);
q = ggml_add(ctx,
q,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1),
q)); // [N, in_channels, h, w]
// k
auto k = ggml_conv_2d(ctx, k_w, h_, 1, 1, 0, 0, 1, 1);
k = ggml_add(ctx,
k,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1),
k)); // [N, in_channels, h, w]
// v
auto v = ggml_conv_2d(ctx, v_w, h_, 1, 1, 0, 0, 1, 1);
v = ggml_add(ctx,
v,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1),
v)); // [N, in_channels, h, w]
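// Single-head self-attention over the h*w spatial positions: q and k are
// flattened to [N, h*w, c] token sequences, the attention weights are
// softmax(q k^T / sqrt(c)), and the weighted sum of v is reshaped back to
// [N, c, h, w] before proj_out.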
q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels]
k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels]
auto w_ = ggml_mul_mat(ctx, k, q); // [N, h * w, h * w]
w_ = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c)));
w_ = ggml_soft_max_inplace(ctx, w_);
v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w]
h_ = ggml_mul_mat(ctx, v, w_); // [N, h * w, in_channels]
h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w]
// proj_out
h_ = ggml_conv_2d(ctx, proj_out_w, h_, 1, 1, 0, 0, 1, 1);
h_ = ggml_add(ctx,
h_,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, proj_out_b, 1, 1, proj_out_b->ne[0], 1),
h_)); // [N, in_channels, h, w]
h_ = ggml_add(ctx, h_, x);
return h_;
}
};
// ldm.modules.diffusionmodules.model.Encoder
struct Encoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int in_channels = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
struct ggml_tensor* conv_in_w; // [ch, in_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [ch, ]
ResnetBlock down_blocks[4][2];
DownSample down_samples[3];
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
// block_in = ch * ch_mult[len_mults - 1]
struct ggml_tensor* norm_out_w; // [block_in, ]
struct ggml_tensor* norm_out_b; // [block_in, ]
struct ggml_tensor* conv_out_w; // [embed_dim*2, block_in, 3, 3]
struct ggml_tensor* conv_out_b; // [embed_dim*2, ]
Encoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = 1;
for (int i = 0; i < len_mults; i++) {
if (i == 0) {
block_in = ch;
} else {
block_in = ch * ch_mult[i - 1];
}
int block_out = ch * ch_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].in_channels = block_in;
down_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != len_mults - 1) {
down_samples[i].channels = block_in;
down_samples[i].out_channels = block_in;
down_samples[i].vae_downsample = true;
}
}
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
}
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += ch * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * block_in * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += z_channels * 2 * block_in * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += z_channels * 2 * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += 6 * ggml_tensor_overhead(); // object overhead
mem_size += mid.block_1.compute_params_mem_size(wtype);
mem_size += mid.attn_1.compute_params_mem_size(wtype);
mem_size += mid.block_2.compute_params_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks; j++) { // down_blocks has num_res_blocks entries per level
mem_size += down_blocks[i][j].compute_params_mem_size(wtype);
}
if (i != 0) {
mem_size += down_samples[i - 1].compute_params_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch);
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, block_in, z_channels * 2);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels * 2);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].init_params(ctx, wtype);
}
if (i != len_mults - 1) {
down_samples[i].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != len_mults - 1) {
down_samples[i].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".downsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
// conv_in
auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
h)); // [N, ch, h, w]
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
h = down_blocks[i][j].forward(ctx, h);
}
if (i != len_mults - 1) {
h = down_samples[i].forward(ctx, h);
}
}
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
// group norm 32
h = ggml_group_norm_32(ctx, h);
h = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h),
ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h));
// silu
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_conv_2d(ctx, conv_out_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv_out_b, 1, 1, conv_out_b->ne[0], 1),
h)); // [N, z_channels*2, h, w]
return h;
}
};
// ldm.modules.diffusionmodules.model.Decoder
struct Decoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int out_ch = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
// block_in = ch * ch_mult[len_mults - 1] = 512
struct ggml_tensor* conv_in_w; // [block_in, z_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [block_in, ]
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
ResnetBlock up_blocks[4][3];
UpSample up_samples[3];
struct ggml_tensor* norm_out_w; // [ch * ch_mult[0], ]
struct ggml_tensor* norm_out_b; // [ch * ch_mult[0], ]
struct ggml_tensor* conv_out_w; // [out_ch, ch * ch_mult[0], 3, 3]
struct ggml_tensor* conv_out_b; // [out_ch, ]
Decoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
for (int i = len_mults - 1; i >= 0; i--) {
int mult = ch_mult[i];
int block_out = ch * mult;
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].in_channels = block_in;
up_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != 0) {
up_samples[i - 1].channels = block_in;
up_samples[i - 1].out_channels = block_in;
}
}
}
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * (ch * ch_mult[0]) * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += (ch * ch_mult[0]) * out_ch * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += out_ch * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += 8 * ggml_tensor_overhead(); // object overhead
mem_size += mid.block_1.compute_params_mem_size(wtype);
mem_size += mid.attn_1.compute_params_mem_size(wtype);
mem_size += mid.block_2.compute_params_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += up_blocks[i][j].compute_params_mem_size(wtype);
}
if (i != 0) {
mem_size += up_samples[i - 1].compute_params_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, block_in);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, ch * ch_mult[0], out_ch);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].init_params(ctx, wtype);
}
if (i != 0) {
up_samples[i - 1].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != 0) {
up_samples[i - 1].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".upsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// conv_in
auto h = ggml_conv_2d(ctx, conv_in_w, z, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
h)); // [N, block_in, h, w]
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
h = up_blocks[i][j].forward(ctx, h);
}
if (i != 0) {
h = up_samples[i - 1].forward(ctx, h);
}
}
// group norm 32
h = ggml_group_norm_32(ctx, h);
h = ggml_add(ctx,
ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h),
ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h));
// silu
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_conv_2d(ctx, conv_out_w, h, 1, 1, 1, 1, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, conv_out_b, 1, 1, conv_out_b->ne[0], 1),
h)); // [N, out_ch, h, w]
return h;
}
};
// ldm.models.autoencoder.AutoencoderKL
struct AutoEncoderKL {
bool decode_only = true;
int embed_dim = 4;
struct
{
int z_channels = 4;
int resolution = 256;
int in_channels = 3;
int out_ch = 3;
int ch = 128;
int ch_mult[4] = {1, 2, 4, 4};
int num_res_blocks = 2;
} dd_config;
struct ggml_tensor* quant_conv_w; // [2*embed_dim, 2*z_channels, 1, 1]
struct ggml_tensor* quant_conv_b; // [2*embed_dim, ]
struct ggml_tensor* post_quant_conv_w; // [z_channels, embed_dim, 1, 1]
struct ggml_tensor* post_quant_conv_b; // [z_channels, ]
Encoder encoder;
Decoder decoder;
AutoEncoderKL(bool decode_only = false)
: decode_only(decode_only) {
assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult));
assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult));
encoder.embed_dim = embed_dim;
decoder.embed_dim = embed_dim;
encoder.ch = dd_config.ch;
decoder.ch = dd_config.ch;
encoder.z_channels = dd_config.z_channels;
decoder.z_channels = dd_config.z_channels;
encoder.in_channels = dd_config.in_channels;
decoder.out_ch = dd_config.out_ch;
encoder.num_res_blocks = dd_config.num_res_blocks;
int len_mults = sizeof(dd_config.ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
encoder.ch_mult[i] = dd_config.ch_mult[i];
decoder.ch_mult[i] = dd_config.ch_mult[i];
}
}
size_t compute_params_mem_size(ggml_type wtype) {
double mem_size = 0;
if (!decode_only) {
mem_size += 2 * embed_dim * 2 * dd_config.z_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // quant_conv_w
mem_size += 2 * embed_dim * ggml_type_sizef(GGML_TYPE_F32); // quant_conv_b
mem_size += encoder.compute_params_mem_size(wtype);
}
mem_size += dd_config.z_channels * embed_dim * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // post_quant_conv_w
mem_size += dd_config.z_channels * ggml_type_sizef(GGML_TYPE_F32); // post_quant_conv_b
mem_size += decoder.compute_params_mem_size(wtype);
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
if (!decode_only) {
quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, 2 * dd_config.z_channels, 2 * embed_dim);
quant_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2 * embed_dim);
encoder.init_params(ctx, wtype);
}
post_quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, embed_dim, dd_config.z_channels);
post_quant_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dd_config.z_channels);
decoder.init_params(ctx, wtype);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (!decode_only) {
tensors[prefix + "quant_conv.weight"] = quant_conv_w;
tensors[prefix + "quant_conv.bias"] = quant_conv_b;
encoder.map_by_name(tensors, prefix + "encoder.");
}
tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w;
tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b;
decoder.map_by_name(tensors, prefix + "decoder.");
}
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// post_quant_conv
auto h = ggml_conv_2d(ctx, post_quant_conv_w, z, 1, 1, 0, 0, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1),
h)); // [N, z_channels, h, w]
h = decoder.forward(ctx, h);
return h;
}
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto h = encoder.forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
// quant_conv
h = ggml_conv_2d(ctx, quant_conv_w, h, 1, 1, 0, 0, 1, 1);
h = ggml_add(ctx,
h,
ggml_repeat(ctx,
ggml_reshape_4d(ctx, quant_conv_b, 1, 1, quant_conv_b->ne[0], 1),
h)); // [N, 2*embed_dim, h/8, w/8]
return h;
}
};
/*================================================= CompVisDenoiser ==================================================*/
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
struct SigmaSchedule {
float alphas_cumprod[TIMESTEPS];
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
virtual std::vector<float> get_sigmas(uint32_t n) = 0;
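// sigma_to_t() inverts the discrete sigma table: it finds the log-sigma
// entries bracketing the query and linearly interpolates a fractional
// timestep index; t_to_sigma() below is the piecewise-linear inverse.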
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
};
struct DiscreteSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
if (n == 0) {
return result;
} else if (n == 1) {
result.push_back(t_to_sigma(t_max));
result.push_back(0);
return result;
}
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
for (int i = 0; i < n; ++i) {
float t = t_max - step * i;
result.push_back(t_to_sigma(t));
}
result.push_back(0);
return result;
}
};
struct KarrasSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float sigma_min = 0.1;
float sigma_max = 10.;
float rho = 7.;
std::vector<float> result(n + 1);
float min_inv_rho = pow(sigma_min, (1. / rho));
float max_inv_rho = pow(sigma_max, (1. / rho));
for (int i = 0; i < n; i++) {
// Eq. (5) from Karras et al 2022
result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.) * (min_inv_rho - max_inv_rho), rho);
}
result[n] = 0.;
return result;
}
};
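// For reference, the defaults above with n = 5 yield sigmas of roughly
// {10.0, 4.07, 1.45, 0.43, 0.10, 0}: spacing is much denser near sigma_min
// than the evenly spaced timesteps of DiscreteSchedule. A usage sketch
// (illustrative only):
//   KarrasSchedule ks;
//   std::vector<float> sigmas = ks.get_sigmas(20); // 21 entries, last is 0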
struct Denoiser {
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual std::vector<float> get_scalings(float sigma) = 0;
};
struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_out, c_in};
}
};
struct CompVisVDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_skip, c_out, c_in};
}
};
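// The denoise wrapper in sample() applies these scalings as
//   denoised = c_skip * input + c_out * model_out
// with the model fed input * c_in. For the eps-predicting CompVisDenoiser,
// c_skip stays 1, so denoised = input - sigma * eps; the v-predicting variant
// blends input and model output with the three coefficients above, following
// the k-diffusion wrappers referenced at the top of this section.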
/*=============================================== StableDiffusionGGML ================================================*/
class StableDiffusionGGML {
public:
ggml_context* clip_params_ctx = NULL;
ggml_context* unet_params_ctx = NULL;
ggml_context* vae_params_ctx = NULL;
bool dynamic = true;
bool vae_decode_only = false;
bool free_params_immediately = false;
std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
int32_t ftype = 1;
int n_threads = -1;
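// 0.18215f is the first-stage latent scale factor from the Stable Diffusion
// model configs; latents are scaled by it between the VAE and the diffusion model.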
float scale_factor = 0.18215f;
size_t max_mem_size = 0;
size_t curr_params_mem_size = 0;
size_t max_params_mem_size = 0;
size_t max_rt_mem_size = 0;
FrozenCLIPEmbedderWithCustomWords cond_stage_model;
UNetModel diffusion_model;
AutoEncoderKL first_stage_model;
std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
StableDiffusionGGML() = default;
StableDiffusionGGML(int n_threads,
bool vae_decode_only,
bool free_params_immediately,
RNGType rng_type)
: n_threads(n_threads),
vae_decode_only(vae_decode_only),
free_params_immediately(free_params_immediately) {
first_stage_model.decode_only = vae_decode_only;
if (rng_type == STD_DEFAULT_RNG) {
rng = std::make_shared<STDDefaultRNG>();
} else if (rng_type == CUDA_RNG) {
rng = std::make_shared<PhiloxRNG>();
}
}
~StableDiffusionGGML() {
if (clip_params_ctx != NULL) {
ggml_free(clip_params_ctx);
clip_params_ctx = NULL;
}
if (unet_params_ctx != NULL) {
ggml_free(unet_params_ctx);
unet_params_ctx = NULL;
}
if (vae_params_ctx != NULL) {
ggml_free(vae_params_ctx);
vae_params_ctx = NULL;
}
}
bool load_from_file(const std::string& file_path, Schedule schedule) {
LOG_INFO("loading model from '%s'", file_path.c_str());
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return false;
}
LOG_DEBUG("verifying magic");
// verify magic
{
uint32_t magic;
file.read(reinterpret_cast<char*>(&magic), sizeof(magic));
if (magic != GGML_FILE_MAGIC) {
LOG_ERROR("invalid model file '%s' (bad magic)", file_path.c_str());
return false;
}
}
LOG_DEBUG("loading hparams");
// load hparams
file.read(reinterpret_cast<char*>(&ftype), sizeof(ftype));
int model_type = (ftype >> 16) & 0xFFFF;
if (model_type >= MODEL_TYPE_COUNT) {
LOG_ERROR("invalid model file '%s' (bad model type value %d)", file_path.c_str(), ftype);
return false;
}
LOG_INFO("model type: %s", model_type_to_str[model_type]);
if (model_type == SD2) {
cond_stage_model = FrozenCLIPEmbedderWithCustomWords((ModelType)model_type);
diffusion_model = UNetModel((ModelType)model_type);
}
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(ftype & 0xFFFF));
LOG_INFO("ftype: %s", ggml_type_name(wtype));
if (wtype == GGML_TYPE_COUNT) {
LOG_ERROR("invalid model file '%s' (bad ftype value %d)", file_path.c_str(), ftype);
return false;
}
LOG_DEBUG("loading vocab");
// load vocab
{
int32_t n_vocab = 0;
file.read(reinterpret_cast<char*>(&n_vocab), sizeof(n_vocab));
if (n_vocab != cond_stage_model.text_model.vocab_size) {
LOG_ERROR("invalid model file '%s' (bad vocab size %d != %d)",
file_path.c_str(), n_vocab, cond_stage_model.text_model.vocab_size);
return false;
}
std::string word;
std::vector<char> buf(128);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
file.read((char*)&len, sizeof(len));
buf.resize(len);
file.read((char*)buf.data(), len);
word.assign(buf.data(), len);
cond_stage_model.tokenizer.add_token(word, i);
}
}
// create the ggml context for network params
LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
{
// cond_stage_model(FrozenCLIPEmbedder)
double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding
ctx_size += cond_stage_model.text_model.compute_params_mem_size(wtype);
LOG_DEBUG("clip params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(ctx_size);
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
clip_params_ctx = ggml_init(params);
if (!clip_params_ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
}
{
// diffusion_model(UNetModel)
double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding
ctx_size += diffusion_model.compute_params_mem_size(wtype);
LOG_DEBUG("unet params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(ctx_size);
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
unet_params_ctx = ggml_init(params);
if (!unet_params_ctx) {
LOG_ERROR("ggml_init() failed");
ggml_free(clip_params_ctx);
clip_params_ctx = NULL;
return false;
}
}
{
// first_stage_model(AutoEncoderKL)
double ctx_size = 1 * 1024 * 1024; // 1 MB, for padding
ctx_size += first_stage_model.compute_params_mem_size(wtype);
LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(ctx_size);
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
vae_params_ctx = ggml_init(params);
if (!vae_params_ctx) {
LOG_ERROR("ggml_init() failed");
ggml_free(clip_params_ctx);
clip_params_ctx = NULL;
ggml_free(unet_params_ctx);
unet_params_ctx = NULL;
return false;
}
}
std::map<std::string, struct ggml_tensor*> tensors;
LOG_DEBUG("preparing memory for the weights");
// prepare memory for the weights
{
// cond_stage_model(FrozenCLIPEmbedder)
cond_stage_model.text_model.init_params(clip_params_ctx, wtype);
cond_stage_model.text_model.map_by_name(tensors, "cond_stage_model.transformer.text_model.");
// diffusion_model(UNetModel)
diffusion_model.init_params(unet_params_ctx, wtype);
diffusion_model.map_by_name(tensors, "model.diffusion_model.");
// first_stage_model(AutoEncoderKL)
first_stage_model.init_params(vae_params_ctx, wtype);
first_stage_model.map_by_name(tensors, "first_stage_model.");
}
LOG_DEBUG("loading weights");
std::set<std::string> tensor_names_in_file;
int64_t t0 = ggml_time_ms();
// load weights
float alphas_cumprod[TIMESTEPS];
{
int n_tensors = 0;
size_t total_size = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
file.read(reinterpret_cast<char*>(&length), sizeof(length));
file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
if (file.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[4] = {1, 1, 1, 1};
for (int i = 0; i < n_dims; ++i) {
file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
const size_t num_bytes = nelements / ggml_blck_size(ggml_type(ttype)) * ggml_type_size(ggml_type(ttype));
std::string name(length, 0);
file.read(&name[0], length);
tensor_names_in_file.insert(std::string(name.data()));
if (std::string(name.data()) == "alphas_cumprod") {
file.read(reinterpret_cast<char*>(alphas_cumprod), nelements * ggml_type_size((ggml_type)ttype));
continue;
}
struct ggml_tensor* tensor;
if (tensors.find(name.data()) != tensors.end()) {
tensor = tensors[name.data()];
} else {
if (name.find("quant") == std::string::npos && name.find("first_stage_model.encoder.") == std::string::npos) {
LOG_WARN("unknown tensor '%s' in model file", name.data());
} else {
if (!vae_decode_only) {
LOG_WARN("unknown tensor '%s' in model file", name.data());
return false;
}
}
file.ignore(num_bytes);
continue;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
LOG_ERROR(
"tensor '%s' has wrong shape in model file: "
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
name.data(),
ne[0], ne[1], ne[2], ne[3],
(int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]);
return false;
}
if (ggml_nelements(tensor) != nelements) {
LOG_ERROR(
"tensor '%s' has wrong number of elements in model file: "
"got %u, expert %zu",
name.data(), nelements, ggml_nelements(tensor));
return false;
}
if (tensor->type != ttype) {
LOG_ERROR("tensor '%s' has wrong type in model file: got %s, expect %s",
name.data(), ggml_type_name(ggml_type(ttype)), ggml_type_name(tensor->type));
return false;
}
file.read(reinterpret_cast<char*>(tensor->data), num_bytes);
total_size += ggml_nbytes(tensor);
}
bool some_tensor_not_init = false;
for (const auto& pair : tensors) {
if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
continue;
}
if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) {
LOG_ERROR("tensor '%s' not in model file", pair.first.c_str());
some_tensor_not_init = true;
}
}
if (tensor_names_in_file.find("alphas_cumprod") == tensor_names_in_file.end()) {
LOG_ERROR("tensor alphas_cumprod not in model file");
some_tensor_not_init = true;
}
if (some_tensor_not_init) {
file.close();
return false;
}
LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
}
max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx);
max_mem_size = max_params_mem_size;
curr_params_mem_size = max_params_mem_size;
LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)",
max_params_mem_size / 1024.0 / 1024.0,
ggml_used_mem(clip_params_ctx) / 1024.0 / 1024.0,
ggml_used_mem(unet_params_ctx) / 1024.0 / 1024.0,
ggml_used_mem(vae_params_ctx) / 1024.0 / 1024.0);
int64_t t1 = ggml_time_ms();
LOG_INFO("loading model from '%s' completed, taking %.2fs", file_path.c_str(), (t1 - t0) * 1.0f / 1000);
file.close();
// check is_using_v_parameterization_for_sd2
bool is_using_v_parameterization = false;
if (model_type == SD2) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
if (is_using_v_parameterization_for_sd2(ctx)) {
is_using_v_parameterization = true;
}
ggml_free(ctx); // the probe context is no longer needed
}
if (is_using_v_parameterization) {
denoiser = std::make_shared<CompVisVDenoiser>();
LOG_INFO("running in v-prediction mode");
} else {
LOG_INFO("running in eps-prediction mode");
}
if (schedule != DEFAULT) {
switch (schedule) {
case DISCRETE:
LOG_INFO("running with discrete schedule");
denoiser->schedule = std::make_shared<DiscreteSchedule>();
break;
case KARRAS:
LOG_INFO("running with Karras schedule");
denoiser->schedule = std::make_shared<KarrasSchedule>();
break;
case DEFAULT:
// Don't touch anything.
break;
default:
LOG_ERROR("Unknown schedule %i", schedule);
abort();
}
}
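// Fill the schedule tables from the checkpoint's alphas_cumprod:
// sigma_t = sqrt((1 - alphas_cumprod[t]) / alphas_cumprod[t]) is the
// k-diffusion noise scale for a VP model, and log_sigmas caches the logs
// used by sigma_to_t()/t_to_sigma().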
for (int i = 0; i < TIMESTEPS; i++) {
denoiser->schedule->alphas_cumprod[i] = alphas_cumprod[i];
denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
}
return true;
}
bool is_using_v_parameterization_for_sd2(ggml_context* res_ctx) {
struct ggml_tensor* x_t = ggml_new_tensor_4d(res_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(x_t, 0.5);
struct ggml_tensor* c = ggml_new_tensor_4d(res_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
ggml_set_f32(c, 0.5);
size_t ctx_size = 10 * 1024 * 1024; // 10MB
// calculate the amount of memory required
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = true;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels]
ggml_set_dynamic(ctx, params.dynamic);
struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb);
ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
struct ggml_cplan cplan = ggml_graph_plan(diffusion_graph, n_threads);
ctx_size += cplan.work_size;
LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
ctx_size * 1.0f / 1024 / 1024,
cplan.work_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
}
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels]
ggml_set_dynamic(ctx, params.dynamic);
ggml_set_f32(timesteps, 999);
set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb);
ggml_hold_dynamic_tensor(out);
struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
struct ggml_cplan cplan = ggml_graph_plan(diffusion_graph, n_threads);
ggml_set_dynamic(ctx, false);
struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
ggml_set_dynamic(ctx, params.dynamic);
cplan.work_data = (uint8_t*)buf->data;
int64_t t0 = ggml_time_ms();
ggml_graph_compute(diffusion_graph, &cplan);
double result = 0.f;
{
float* vec_x = (float*)x_t->data;
float* vec_out = (float*)out->data;
int64_t n = ggml_nelements(out);
for (int i = 0; i < n; i++) {
result += ((double)vec_out[i] - (double)vec_x[i]);
}
result /= n;
}
#ifdef GGML_PERF
ggml_graph_print(&diffusion_graph);
#endif
int64_t t1 = ggml_time_ms();
LOG_INFO("check is_using_v_parameterization_for_sd2 completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB",
(ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
ggml_free(ctx); // free the compute context before returning
return result < -1;
}
ggml_tensor* get_learned_condition(ggml_context* res_ctx, const std::string& text) {
auto tokens_and_weights = cond_stage_model.tokenize(text,
cond_stage_model.text_model.max_position_embeddings,
true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
size_t ctx_size = 10 * 1024 * 1024; // 10MB
// calculate the amount of memory required
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = true;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
ggml_set_dynamic(ctx, params.dynamic);
struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
struct ggml_cgraph cond_graph = ggml_build_forward(hidden_states);
struct ggml_cplan cplan = ggml_graph_plan(&cond_graph, n_threads);
ctx_size += cplan.work_size;
ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
LOG_DEBUG("condition context need %.2fMB static memory, with work_size needing %.2fMB",
ctx_size * 1.0f / 1024 / 1024,
cplan.work_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
}
// allocate the required memory and compute forward
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
ggml_set_dynamic(ctx, params.dynamic);
struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
LOG_DEBUG("building condition graph completed: %d nodes, %d leafs",
cond_graph->n_nodes, cond_graph->n_leafs);
memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
int64_t t0 = ggml_time_ms();
ggml_graph_compute_with_ctx(ctx, cond_graph, n_threads);
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
ggml_tensor* result = ggml_dup_tensor(res_ctx, hidden_states); // [N, n_token, hidden_size]
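// Apply the per-token weights produced by the tokenizer, then rescale so the
// weighted hidden states keep the original overall mean (A1111-style prompt
// emphasis).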
{
int64_t nelements = ggml_nelements(hidden_states);
float original_mean = 0.f;
float new_mean = 0.f;
float* vec = (float*)hidden_states->data;
for (int i = 0; i < nelements; i++) {
original_mean += vec[i] / nelements * 1.0f;
}
for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
float value = ggml_tensor_get_f32(hidden_states, i0, i1, i2);
value *= weights[i1];
ggml_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
vec = (float*)result->data;
for (int i = 0; i < nelements; i++) {
new_mean += vec[i] / nelements * 1.0f;
}
for (int i = 0; i < nelements; i++) {
vec[i] = vec[i] * (original_mean / new_mean);
}
}
// print_ggml_tensor(result);
size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
if (rt_mem_size > max_rt_mem_size) {
max_rt_mem_size = rt_mem_size;
}
size_t graph_mem_size = ggml_used_mem(clip_params_ctx) + rt_mem_size;
size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
if (curr_mem_size > max_mem_size) {
max_mem_size = curr_mem_size;
}
LOG_INFO(
"condition graph use %.2fMB of memory: params %.2fMB, "
"runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
graph_mem_size * 1.0f / 1024 / 1024,
ggml_used_mem(clip_params_ctx) * 1.0f / 1024 / 1024,
rt_mem_size * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
ggml_free(ctx);
return result; // [1, 77, 768]
}
ggml_tensor* sample(ggml_context* res_ctx,
ggml_tensor* x_t,
ggml_tensor* c,
ggml_tensor* uc,
float cfg_scale,
SampleMethod method,
const std::vector<float>& sigmas) {
size_t steps = sigmas.size() - 1;
// x_t = load_tensor_from_file(res_ctx, "./rand0.bin");
// print_ggml_tensor(x_t);
struct ggml_tensor* x = ggml_dup_tensor(res_ctx, x_t);
copy_ggml_tensor(x, x_t);
size_t ctx_size = 10 * 1024 * 1024; // 10MB
// calculate the amount of memory required
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = true;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels]
ggml_set_dynamic(ctx, params.dynamic);
struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
struct ggml_cplan cplan = ggml_graph_plan(diffusion_graph, n_threads);
ctx_size += cplan.work_size;
LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
ctx_size * 1.0f / 1024 / 1024,
cplan.work_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
}
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
ggml_set_dynamic(ctx, false);
struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels]
ggml_set_dynamic(ctx, params.dynamic);
struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
ggml_hold_dynamic_tensor(out);
struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
struct ggml_cplan cplan = ggml_graph_plan(diffusion_graph, n_threads);
ggml_set_dynamic(ctx, false);
struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
ggml_set_dynamic(ctx, params.dynamic);
cplan.work_data = (uint8_t*)buf->data;
// x = x * sigmas[0]
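// x_t is expected to be unit-variance Gaussian noise, so scaling by
// sigmas[0] = sigma_max starts the sampler at the highest noise level.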
{
float* vec = (float*)x->data;
for (int i = 0; i < ggml_nelements(x); i++) {
vec[i] = vec[i] * sigmas[0];
}
}
// denoise wrapper
ggml_set_dynamic(ctx, false);
struct ggml_tensor* out_cond = NULL;
struct ggml_tensor* out_uncond = NULL;
if (cfg_scale != 1.0f && uc != NULL) {
out_uncond = ggml_dup_tensor(ctx, x);
}
struct ggml_tensor* denoised = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
auto denoise = [&](ggml_tensor* input, float sigma, int step) {
int64_t t0 = ggml_time_ms();
float c_skip = 1.0f;
float c_out = 1.0f;
float c_in = 1.0f;
std::vector<float> scaling = denoiser->get_scalings(sigma);
if (scaling.size() == 3) { // CompVisVDenoiser
c_skip = scaling[0];
c_out = scaling[1];
c_in = scaling[2];
} else { // CompVisDenoiser
c_out = scaling[0];
c_in = scaling[1];
}
float t = denoiser->schedule->sigma_to_t(sigma);
ggml_set_f32(timesteps, t);
set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
copy_ggml_tensor(noised_input, input);
// noised_input = noised_input * c_in
{
float* vec = (float*)noised_input->data;
for (int i = 0; i < ggml_nelements(noised_input); i++) {
vec[i] = vec[i] * c_in;
}
}
if (cfg_scale != 1.0f && uc != NULL) {
// uncond
copy_ggml_tensor(context, uc);
ggml_graph_compute(diffusion_graph, &cplan);
copy_ggml_tensor(out_uncond, out);
// cond
copy_ggml_tensor(context, c);
ggml_graph_compute(diffusion_graph, &cplan);
out_cond = out;
// out_uncond + cfg_scale * (out_cond - out_uncond)
{
float* vec_out = (float*)out->data;
float* vec_out_uncond = (float*)out_uncond->data;
float* vec_out_cond = (float*)out_cond->data;
for (int i = 0; i < ggml_nelements(out); i++) {
vec_out[i] = vec_out_uncond[i] + cfg_scale * (vec_out_cond[i] - vec_out_uncond[i]);
}
}
} else {
// cond
copy_ggml_tensor(context, c);
ggml_graph_compute(diffusion_graph, &cplan);
}
// v = out, eps = out
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
{
float* vec_denoised = (float*)denoised->data;
float* vec_input = (float*)input->data;
float* vec_out = (float*)out->data;
for (int i = 0; i < ggml_nelements(denoised); i++) {
vec_denoised[i] = vec_out[i] * c_out + vec_input[i] * c_skip;
}
}
#ifdef GGML_PERF
ggml_graph_print(&diffusion_graph);
#endif
int64_t t1 = ggml_time_ms();
if (step > 0) {
LOG_INFO("step %d sampling completed, taking %.2fs", step, (t1 - t0) * 1.0f / 1000);
LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB",
(ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
}
};
        // sampling loop: each sampler below integrates dx/dsigma = (x - denoised) / sigma
        // from sigmas[0] down to 0; they differ in integration order and in the noise they re-inject
switch (method) {
case EULER_A: {
LOG_INFO("sampling using Euler A method");
ggml_set_dynamic(ctx, false);
struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
// denoise
denoise(x, sigma, i + 1);
// d = (x - denoised) / sigma
{
                        float* vec_d = (float*)d->data;
                        float* vec_x = (float*)x->data;
                        float* vec_denoised = (float*)denoised->data;
                        for (int j = 0; j < ggml_nelements(d); j++) {
                            vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
                        }
}
// get_ancestral_step
float sigma_up = std::min(sigmas[i + 1],
std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
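                    // the split satisfies sigma_down^2 + sigma_up^2 == sigmas[i + 1]^2:
                    // step deterministically down to sigma_down, then re-inject fresh
                    // noise with std sigma_up so the marginal matches sigmas[i + 1]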
// Euler method
float dt = sigma_down - sigmas[i];
// x = x + d * dt
{
                        float* vec_d = (float*)d->data;
                        float* vec_x = (float*)x->data;
                        for (int j = 0; j < ggml_nelements(x); j++) {
                            vec_x[j] = vec_x[j] + vec_d[j] * dt;
                        }
}
if (sigmas[i + 1] > 0) {
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
ggml_tensor_set_f32_randn(noise, rng);
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
{
                            float* vec_x = (float*)x->data;
                            float* vec_noise = (float*)noise->data;
                            for (int j = 0; j < ggml_nelements(x); j++) {
                                vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
                            }
}
}
}
} break;
case EULER: // Implemented without any sigma churn
{
LOG_INFO("sampling using Euler method");
ggml_set_dynamic(ctx, false);
struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
// denoise
denoise(x, sigma, i + 1);
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(d); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
}
}
float dt = sigmas[i + 1] - sigma;
// x = x + d * dt
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
}
}
} break;
case HEUN: {
LOG_INFO("sampling using Heun method");
ggml_set_dynamic(ctx, false);
struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
for (int i = 0; i < steps; i++) {
// denoise
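                    // the predictor call gets a negative step number so that only the
                    // corrector call below logs per-step timing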
denoise(x, sigmas[i], -(i + 1));
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
}
}
float dt = sigmas[i + 1] - sigmas[i];
if (sigmas[i + 1] == 0) {
// Euler step
// x = x + d * dt
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
} else {
// Heun step
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_x2 = (float*)x2->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x2[j] = vec_x[j] + vec_d[j] * dt;
}
denoise(x2, sigmas[i + 1], i + 1);
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
vec_d[j] = (vec_d[j] + d2) / 2;
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
}
}
} break;
case DPMPP2M: // DPM++ (2M) from Karras et al (2022)
{
LOG_INFO("sampling using DPM++ (2M) method");
ggml_set_dynamic(ctx, false);
struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
auto t_fn = [](float sigma) -> float { return -log(sigma); };
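                // DPM-Solver++(2M) works in t = -log(sigma); with h = t_next - t each
                // update is x <- (sigmas[i + 1] / sigmas[i]) * x - (exp(-h) - 1) * denoised_d,
                // where denoised_d linearly extrapolates the current and previous model
                // outputs (a second-order multistep correction); the first and final
                // steps use denoised directly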
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
float a = sigmas[i + 1] / sigmas[i];
float b = exp(-h) - 1.;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
float* vec_old_denoised = (float*)old_denoised->data;
if (i == 0 || sigmas[i + 1] == 0) {
// Simpler step for the edge cases
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
}
} else {
float h_last = t - t_fn(sigmas[i - 1]);
float r = h_last / h;
for (int j = 0; j < ggml_nelements(x); j++) {
float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. * r)) * vec_old_denoised[j];
vec_x[j] = a * vec_x[j] - b * denoised_d;
}
}
// old_denoised = denoised
for (int j = 0; j < ggml_nelements(x); j++) {
vec_old_denoised[j] = vec_denoised[j];
}
}
} break;
case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
{
LOG_INFO("sampling using modified DPM++ (2M) method");
ggml_set_dynamic(ctx, false);
struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x);
ggml_set_dynamic(ctx, params.dynamic);
auto t_fn = [](float sigma) -> float { return -log(sigma); };
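                // v2 tweak (see the discussion linked above): r is built from the
                // max/min of the two step sizes and the exponential is taken at their
                // midpoint h_d, which is reported to behave better at low step counts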
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
float a = sigmas[i + 1] / sigmas[i];
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
float* vec_old_denoised = (float*)old_denoised->data;
if (i == 0 || sigmas[i + 1] == 0) {
// Simpler step for the edge cases
float b = exp(-h) - 1.;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
}
} else {
float h_last = t - t_fn(sigmas[i - 1]);
float h_min = std::min(h_last, h);
float h_max = std::max(h_last, h);
float r = h_max / h_min;
float h_d = (h_max + h_min) / 2.;
float b = exp(-h_d) - 1.;
for (int j = 0; j < ggml_nelements(x); j++) {
float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. * r)) * vec_old_denoised[j];
vec_x[j] = a * vec_x[j] - b * denoised_d;
}
}
// old_denoised = denoised
for (int j = 0; j < ggml_nelements(x); j++) {
vec_old_denoised[j] = vec_denoised[j];
}
}
} break;
default:
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
abort();
}
size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
if (rt_mem_size > max_rt_mem_size) {
max_rt_mem_size = rt_mem_size;
}
size_t graph_mem_size = ggml_used_mem(unet_params_ctx) + rt_mem_size;
size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
if (curr_mem_size > max_mem_size) {
max_mem_size = curr_mem_size;
}
LOG_INFO(
"diffusion graph use %.2fMB of memory: params %.2fMB, "
"runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
graph_mem_size * 1.0f / 1024 / 1024,
ggml_used_mem(unet_params_ctx) * 1.0f / 1024 / 1024,
rt_mem_size * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
ggml_free(ctx);
return x;
}
ggml_tensor* encode_first_stage(ggml_context* res_ctx, ggml_tensor* x) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
struct ggml_tensor* result = NULL;
// calculate the amount of memory required
size_t ctx_size = 10 * 1024 * 1024; // 10MB
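        // two-pass pattern: build the graph once with no_alloc = true purely to
        // measure tensor and work-buffer sizes, then rebuild and run it for real in
        // a context of exactly that size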
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = true;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
struct ggml_tensor* moments = first_stage_model.encode(ctx, x);
ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, moments);
struct ggml_cplan cplan = ggml_graph_plan(vae_graph, n_threads);
ctx_size += cplan.work_size;
LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
ctx_size * 1.0f / 1024 / 1024,
cplan.work_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
}
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
struct ggml_tensor* moments = first_stage_model.encode(ctx, x);
struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, moments);
int64_t t0 = ggml_time_ms();
ggml_graph_compute_with_ctx(ctx, vae_graph, n_threads);
int64_t t1 = ggml_time_ms();
#ifdef GGML_PERF
            ggml_graph_print(vae_graph);
#endif
LOG_DEBUG("computing vae graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
result = ggml_dup_tensor(res_ctx, moments);
copy_ggml_tensor(result, moments);
size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
if (rt_mem_size > max_rt_mem_size) {
max_rt_mem_size = rt_mem_size;
}
size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
if (curr_mem_size > max_mem_size) {
max_mem_size = curr_mem_size;
}
LOG_INFO(
"vae graph use %.2fMB of memory: params %.2fMB, "
"runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
graph_mem_size * 1.0f / 1024 / 1024,
ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
rt_mem_size * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
ggml_free(ctx);
}
return result;
}
// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
ggml_tensor* get_first_stage_encoding(ggml_context* res_ctx, ggml_tensor* moments) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent);
ggml_tensor_set_f32_randn(noise, rng);
// noise = load_tensor_from_file(res_ctx, "noise.bin");
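        // moments packs mean and logvar along dim 2; sample the diagonal Gaussian
        // (cf. DiagonalGaussianDistribution.sample) and apply the SD scale factor:
        // latent = (mean + exp(0.5 * clamp(logvar, -30, 20)) * noise) * scale_factor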
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latent->ne[3]; i++) {
for (int j = 0; j < latent->ne[2]; j++) {
for (int k = 0; k < latent->ne[1]; k++) {
for (int l = 0; l < latent->ne[0]; l++) {
mean = ggml_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
value = value * scale_factor;
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_tensor_set_f32(latent, value, l, k, j, i);
}
}
}
}
}
return latent;
}
ggml_tensor* decode_first_stage(ggml_context* res_ctx, ggml_tensor* z) {
int64_t W = z->ne[0];
int64_t H = z->ne[1];
struct ggml_tensor* result_img = NULL;
{
float* vec = (float*)z->data;
for (int i = 0; i < ggml_nelements(z); i++) {
vec[i] = 1.0f / scale_factor * vec[i];
}
}
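        // undo the scale_factor that get_first_stage_encoding applied to the latent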
// calculate the amount of memory required
size_t ctx_size = 10 * 1024 * 1024; // 10MB
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = true;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
            struct ggml_tensor* img = first_stage_model.decode(ctx, z);
ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
struct ggml_cplan cplan = ggml_graph_plan(vae_graph, n_threads);
ctx_size += cplan.work_size;
LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
ctx_size * 1.0f / 1024 / 1024,
cplan.work_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
}
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = dynamic;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
struct ggml_tensor* img = first_stage_model.decode(ctx, z);
struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
int64_t t0 = ggml_time_ms();
ggml_graph_compute_with_ctx(ctx, vae_graph, n_threads);
int64_t t1 = ggml_time_ms();
#ifdef GGML_PERF
            ggml_graph_print(vae_graph);
#endif
LOG_DEBUG("computing vae graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
result_img = ggml_dup_tensor(res_ctx, img);
copy_ggml_tensor(result_img, img);
size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
if (rt_mem_size > max_rt_mem_size) {
max_rt_mem_size = rt_mem_size;
}
size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
if (curr_mem_size > max_mem_size) {
max_mem_size = curr_mem_size;
}
LOG_INFO(
"vae graph use %.2fMB of memory: params %.2fMB, "
"runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
graph_mem_size * 1.0f / 1024 / 1024,
ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
rt_mem_size * 1.0f / 1024 / 1024,
ctx_size * 1.0f / 1024 / 1024,
ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
ggml_free(ctx);
}
return result_img;
}
};
/*================================================= StableDiffusion ==================================================*/
StableDiffusion::StableDiffusion(int n_threads,
bool vae_decode_only,
bool free_params_immediately,
RNGType rng_type) {
sd = std::make_shared<StableDiffusionGGML>(n_threads,
vae_decode_only,
free_params_immediately,
rng_type);
}
bool StableDiffusion::load_from_file(const std::string& file_path, Schedule s) {
return sd->load_from_file(file_path, s);
}
std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
const std::string& negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
int64_t seed) {
std::vector<uint8_t> result;
struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10MB
params.mem_size += width * height * 3 * sizeof(float) * 2;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return result;
}
if (seed < 0) {
seed = (int)time(NULL);
}
sd->rng->manual_seed(seed);
int64_t t0 = ggml_time_ms();
ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
struct ggml_tensor* uc = NULL;
if (cfg_scale != 1.0) {
uc = sd->get_learned_condition(ctx, negative_prompt);
}
int64_t t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
ggml_free(sd->clip_params_ctx);
sd->clip_params_ctx = NULL;
}
int C = 4;
int W = width / 8;
int H = height / 8;
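    // the VAE downsamples by a factor of 8, so sampling runs in a
    // (width / 8) x (height / 8) x 4 latent space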
struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(x_t, sd->rng);
std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
LOG_INFO("start sampling");
struct ggml_tensor* x_0 = sd->sample(ctx, x_t, c, uc, cfg_scale, sample_method, sigmas);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t t2 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
ggml_free(sd->unet_params_ctx);
sd->unet_params_ctx = NULL;
}
struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
if (img != NULL) {
result = ggml_to_image_vec(img);
}
int64_t t3 = ggml_time_ms();
LOG_INFO("decode_first_stage completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
ggml_free(sd->vae_params_ctx);
sd->vae_params_ctx = NULL;
}
LOG_INFO(
"txt2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
"peak runtime memory %.2fMB",
(t3 - t0) * 1.0f / 1000,
sd->max_mem_size * 1.0f / 1024 / 1024,
sd->max_params_mem_size * 1.0f / 1024 / 1024,
sd->max_rt_mem_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
return result;
}
std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_img_vec,
const std::string& prompt,
const std::string& negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
float strength,
int64_t seed) {
std::vector<uint8_t> result;
    if (init_img_vec.size() != width * height * 3) {
        LOG_ERROR("the size of init_img_vec (%zu) does not match width * height * 3 (%d)",
                  init_img_vec.size(), width * height * 3);
        return result;
    }
LOG_INFO("img2img %dx%d", width, height);
std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
size_t t_enc = static_cast<size_t>(sample_steps * strength);
LOG_INFO("target t_enc is %zu steps", t_enc);
std::vector<float> sigma_sched;
sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
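    // truncate the schedule so sampling covers only the last t_enc steps; a higher
    // strength starts from a higher sigma and preserves less of the original image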
struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10MB
params.mem_size += width * height * 3 * sizeof(float) * 2;
params.mem_buffer = NULL;
params.no_alloc = false;
params.dynamic = false;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return result;
}
if (seed < 0) {
seed = (int)time(NULL);
}
sd->rng->manual_seed(seed);
ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
image_vec_to_ggml(init_img_vec, init_img);
int64_t t0 = ggml_time_ms();
ggml_tensor* moments = sd->encode_first_stage(ctx, init_img);
ggml_tensor* init_latent = sd->get_first_stage_encoding(ctx, moments);
// print_ggml_tensor(init_latent);
int64_t t1 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
ggml_reset_curr_max_dynamic_size(); // reset counter
ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
struct ggml_tensor* uc = NULL;
if (cfg_scale != 1.0) {
uc = sd->get_learned_condition(ctx, negative_prompt);
}
int64_t t2 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
ggml_free(sd->clip_params_ctx);
sd->clip_params_ctx = NULL;
}
LOG_INFO("start sampling");
struct ggml_tensor* x_0 = sd->sample(ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
// struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t t3 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
ggml_free(sd->unet_params_ctx);
sd->unet_params_ctx = NULL;
}
struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
if (img != NULL) {
result = ggml_to_image_vec(img);
}
int64_t t4 = ggml_time_ms();
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
ggml_free(sd->vae_params_ctx);
sd->vae_params_ctx = NULL;
}
LOG_INFO(
"img2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
"peak runtime memory %.2fMB",
(t4 - t0) * 1.0f / 1000,
sd->max_mem_size * 1.0f / 1024 / 1024,
sd->max_params_mem_size * 1.0f / 1024 / 1024,
sd->max_rt_mem_size * 1.0f / 1024 / 1024);
ggml_free(ctx);
return result;
}