feat: add PhotoMaker Version 2 support (#358)
* first attempt at updating to photomaker v2

* continue adding photomaker v2 modules

* finishing the last few pieces for photomaker v2; id_embeds must be generated by a manual step and passed in as an input file

* added a name converter for Photomaker V2; build ok

* more debugging underway

* failing at cuda mat_mul

* updated chunk_half to be more efficient; reworked feedforward

* fixed a bug: ggml_view_4d must be used carefully to take chunks of a tensor, and the strides need to be recalculated or set properly; still failing at the soft_max cuda op (see the sketch after the changed-files summary below)

* redid the weight calculation and weight*v

* fixed a bug; PhotoMaker V2 now mostly works

* added a Python script for face detection (needed by PhotoMaker V2)

* updated readme for photomaker

* fixed a bug causing PMV1 to crash; both V1 and V2 now work

* fixed clean_input_ids for PMV2

* fixed a double counting bug in tokenize_with_trigger_token

* updated photomaker readme

* removed some commented code

* improved reconstruction of the class-word-free prompt

* changed id_embeds reading to raw binary using the existing load-tensor function; this is more efficient than a full model load and also makes it easier to work with the sd server (see the reader sketch after the face_detect.py listing below)

* minor clean up

---------

Co-authored-by: bssrdf <[email protected]>
bssrdf and bssrdf authored Nov 23, 2024
1 parent b99cbfe commit 2b1bc06
Showing 11 changed files with 845 additions and 57 deletions.
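
The ggml_view_4d stride pitfall called out in the commit message can be sketched as follows. This is an illustrative sketch, not the commit's actual chunk_half implementation; the helper name and the final ggml_cont step are assumptions:

```cpp
#include "ggml.h"

// Illustrative sketch: split x into two halves along ne[0]. The crucial
// detail is that both views reuse the parent tensor's byte strides
// x->nb[1..3]; recomputing strides from the halved ne[0] makes the second
// chunk read the wrong elements, which is the kind of bug described above.
static void chunk_half(struct ggml_context* ctx, struct ggml_tensor* x,
                       struct ggml_tensor** first, struct ggml_tensor** second) {
    int64_t half = x->ne[0] / 2;
    *first  = ggml_view_4d(ctx, x, half, x->ne[1], x->ne[2], x->ne[3],
                           x->nb[1], x->nb[2], x->nb[3], 0);
    *second = ggml_view_4d(ctx, x, half, x->ne[1], x->ne[2], x->ne[3],
                           x->nb[1], x->nb[2], x->nb[3], half * x->nb[0]);
    // Ops such as soft_max on CUDA may expect contiguous inputs, so the
    // views are materialized before use.
    *first  = ggml_cont(ctx, *first);
    *second = ggml_cont(ctx, *second);
}
```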
27 changes: 22 additions & 5 deletions clip.hpp
@@ -343,6 +343,14 @@ class CLIPTokenizer {
}
}

std::string clean_up_tokenization(std::string& text) {
std::regex pattern(R"( ,)");
// Replace " ," with ","
std::string result = std::regex_replace(text, pattern, ",");
return result;
}

std::string decode(const std::vector<int>& tokens) {
std::string text = "";
for (int t : tokens) {
@@ -351,8 +359,12 @@ class CLIPTokenizer {
std::u32string ts = decoder[t];
// printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
std::string s = utf32_to_utf8(ts);
if (s.length() >= 4 && ends_with(s, "</w>")) {
text += " " + s.replace(s.length() - 4, s.length() - 1, "");
if (s.length() >= 4) {
if (ends_with(s, "</w>")) {
text += s.replace(s.length() - 4, 4, "") + " ";
} else {
text += s;
}
} else {
text += " " + s;
}
@@ -364,6 +376,7 @@ class CLIPTokenizer {

// std::string s((char *)bytes.data());
// std::string s = "";
text = clean_up_tokenization(text);
return trim(text);
}
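
The effect of the new cleanup step can be reproduced in isolation (a standalone snippet, not part of the commit; the sample string is invented):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // decode() now emits the word-boundary space *after* each "</w>" word,
    // which can leave a stray space before punctuation.
    std::string decoded = "a girl , masterpiece ";
    std::string cleaned = std::regex_replace(decoded, std::regex(R"( ,)"), ",");
    std::cout << cleaned << "\n";  // "a girl, masterpiece " -- trim() removes the tail
    return 0;
}
```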

@@ -755,7 +768,8 @@ class CLIPVisionModel : public GGMLBlock {
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}

struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values,
bool return_pooled = true) {
// pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -765,14 +779,17 @@ class CLIPVisionModel : public GGMLBlock {
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
x = encoder->forward(ctx, x, -1, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]

GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) {
ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
return pooled; // [N, hidden_size]
} else {
return x; // [N, n_token, hidden_size]
// return x; // [N, n_token, hidden_size]
return last_hidden_state; // [N, n_token, hidden_size]
}
}
};
24 changes: 15 additions & 9 deletions conditioner.hpp
@@ -4,6 +4,7 @@
#include "clip.hpp"
#include "t5.hpp"


struct SDCondition {
struct ggml_tensor* c_crossattn = NULL; // aka context
struct ggml_tensor* c_vector = NULL; // aka y
@@ -44,6 +45,7 @@ struct Conditioner {
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDVersion version = VERSION_SD1;
PMVersion pm_version = VERSION_1;
CLIPTokenizer tokenizer;
ggml_type wtype;
std::shared_ptr<CLIPTextModelRunner> text_model;
@@ -59,8 +61,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
ggml_type wtype,
const std::string& embd_dir,
SDVersion version = VERSION_SD1,
PMVersion pv = VERSION_1,
int clip_skip = -1)
: version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
: version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_SD2 || version == VERSION_SDXL) {
@@ -159,7 +162,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
bool padding = false) {
return tokenize_with_trigger_token(text, num_input_imgs, image_token,
text_model->model.n_token, padding);
}
@@ -268,7 +271,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<int> clean_input_ids_tmp;
for (uint32_t i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (uint32_t i = 0; i < num_input_imgs; i++)
for (uint32_t i = 0; i < (pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
clean_input_ids_tmp.push_back(class_token);
for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
@@ -279,13 +282,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
}
tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
// BUG!! double counting: pad_tokens will add BOS at the beginning
// tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
// weights.insert(weights.begin(), 1.0);

tokenizer.pad_tokens(tokens, weights, max_length, padding);

int offset = pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
for (uint32_t i = 0; i < tokens.size(); i++) {
if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
// hardcode for now
class_token_mask.push_back(true);
else
class_token_mask.push_back(false);
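
For intuition, a hypothetical token layout showing why the mask window widens for PMV2 (all values invented for illustration):

```cpp
// Hypothetical prompt "a girl img smiling" with class word "girl",
// trigger word "img", and num_input_imgs = 2:
//
//   V1: the class word expands to num_input_imgs copies
//       [BOS, a, girl, girl, smiling, ...]              -> offset = 2
//   V2: each input image contributes num_tokens(=2) id-embedding slots
//       [BOS, a, girl, girl, girl, girl, smiling, ...]  -> offset = 4
//
// class_token_mask then flags exactly `offset` positions starting at
// class_idx + 1, where the duplicated class tokens sit once pad_tokens
// has prepended BOS.
```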
@@ -530,7 +536,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
@@ -958,7 +964,7 @@ struct SD3CLIPEmbedder : public Conditioner {
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
GGML_ASSERT(0 && "Not implemented yet!");
}

24 changes: 23 additions & 1 deletion docs/photo_maker.md
@@ -29,4 +29,26 @@ Example:

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```

## PhotoMaker Version 2

[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) brings some key improvements. Unfortunately, it also has a very heavy dependency (insightface, for face detection), which makes running it in ```SD.cpp``` a bit involved.

Running PMV2 is now a two-step process:

- Run the Python script ```face_detect.py``` to obtain **id_embeds** for the given input images:
```
python face_detect.py input_image_dir
```
An ```id_embeds.bin``` file will be generated in ```input_image_dir```.

**Note: this step only needs to be run once; the same ```id_embeds``` can be reused.**

- Run the same command as in Version 1, but replace ```photomaker-v1.safetensors``` with ```photomaker-v2.safetensors``` (see the example after this list).

You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)

- All the command-line parameters from Version 1 remain the same for Version 2.
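
For example, assuming the same models and inputs as the Version 1 command above:

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v2.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```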


88 changes: 88 additions & 0 deletions face_detect.py
@@ -0,0 +1,88 @@
import os
import sys

import numpy as np
import torch
from diffusers.utils import load_image
# pip install insightface==0.7.3
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file

###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
###
class FaceAnalysis2(FaceAnalysis):
    # NOTE: allows setting det_size for each detection call.
    # the model allows it but the wrapping code from insightface
    # doesn't show it, and people end up loading duplicate models
    # for different sizes where there is absolutely no need to
    def get(self, img, max_num=0, det_size=(640, 640)):
        if det_size is not None:
            self.det_model.input_size = det_size

        return super().get(img, max_num)

def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
    # NOTE: try to detect faces; if none are found, lower det_size until some are
    detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]

    for size in detection_sizes:
        faces = face_analysis.get(img_data, det_size=size)
        if len(faces) > 0:
            return faces

    return []

if __name__ == "__main__":
    # face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
    face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
    face_detector.prepare(ctx_id=0, det_size=(640, 640))
    # input_folder_name = './scarletthead_woman'
    input_folder_name = sys.argv[1]
    image_basename_list = os.listdir(input_folder_name)
    image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])

    input_id_images = []
    for image_path in image_path_list:
        input_id_images.append(load_image(image_path))

    id_embed_list = []

    for img in input_id_images:
        img = np.array(img)
        img = img[:, :, ::-1]  # RGB -> BGR, the channel order insightface expects
        faces = analyze_faces(face_detector, img)
        if len(faces) > 0:
            id_embed_list.append(torch.from_numpy(faces[0]['embedding']))

    if len(id_embed_list) == 0:
        raise ValueError("No face detected in input image pool")

    id_embeds = torch.stack(id_embed_list)

    # for r in id_embeds:
    #     print(r)
    # torch.save(id_embeds, input_folder_name+'/id_embeds.pt')
    # weights = dict()
    # weights["id_embeds"] = id_embeds
    # save_file(weights, input_folder_name+'/id_embeds.safetensors')

    binary_data = id_embeds.numpy().tobytes()
    two = 4   # header field: number of dimensions (the variable name is historical)
    zero = 0  # header field: dtype tag (0 = f32 in ggml's enumeration)
    one = 1
    tensor_name = "id_embeds"
    # Write binary data to a file in the simple layout read by the existing
    # load-tensor function: n_dims, name length, dtype, ne[0..3]
    # (all little-endian int32), followed by the name and the raw data
    with open(input_folder_name + '/id_embeds.bin', "wb") as f:
        f.write(two.to_bytes(4, byteorder='little'))
        f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
        f.write(zero.to_bytes(4, byteorder='little'))
        f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))  # ne0: embedding size
        f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))  # ne1: number of images
        f.write(one.to_bytes(4, byteorder='little'))
        f.write(one.to_bytes(4, byteorder='little'))
        f.write(tensor_name.encode('ascii'))
        f.write(binary_data)
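
For reference, the header written above can be read back with a minimal standalone reader. This is a sketch: the field meanings are inferred from the writer, and in sd.cpp the file is actually consumed by the existing load-tensor path instead:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Minimal reader for id_embeds.bin, mirroring the Python writer above.
int main(int argc, char** argv) {
    const char* path = argc > 1 ? argv[1] : "id_embeds.bin";
    FILE* f = std::fopen(path, "rb");
    if (!f) { std::perror("fopen"); return 1; }
    int32_t n_dims = 0, name_len = 0, dtype = 0, ne[4] = {1, 1, 1, 1};
    std::fread(&n_dims, sizeof(int32_t), 1, f);
    std::fread(&name_len, sizeof(int32_t), 1, f);
    std::fread(&dtype, sizeof(int32_t), 1, f);  // 0: f32
    for (int i = 0; i < 4; i++) std::fread(&ne[i], sizeof(int32_t), 1, f);
    std::string name(name_len, '\0');
    std::fread(&name[0], 1, name_len, f);
    std::vector<float> data((size_t)ne[0] * ne[1] * ne[2] * ne[3]);
    std::fread(data.data(), sizeof(float), data.size(), f);
    std::fclose(f);
    std::printf("%s: ne = [%d, %d, %d, %d]\n", name.c_str(), ne[0], ne[1], ne[2], ne[3]);
    return 0;
}
```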


8 changes: 7 additions & 1 deletion ggml_extend.hpp
@@ -1047,6 +1047,11 @@ struct GGMLRunner {
params_buffer_size / (1024.0 * 1024.0),
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
num_tensors);
// printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
// get_desc().c_str(),
// params_buffer_size / (1024.0 * 1024.0),
// ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
// num_tensors);
return true;
}

@@ -1216,7 +1221,8 @@ class Linear : public UnaryBlock {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
if (bias) {
params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
}

}

public:
42 changes: 40 additions & 2 deletions model.cpp
@@ -146,6 +146,33 @@ std::unordered_map<std::string, std::string> vae_decoder_name_map = {
{"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"},
};

std::unordered_map<std::string, std::string> pmid_v2_name_map = {
{"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"},
{"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight",
"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"},
{"pmid.qformer_perceiver.token_proj.0.bias",
"pmid.qformer_perceiver.token_proj.fc1.bias"},
{"pmid.qformer_perceiver.token_proj.2.bias",
"pmid.qformer_perceiver.token_proj.fc2.bias"},
{"pmid.qformer_perceiver.token_proj.0.weight",
"pmid.qformer_perceiver.token_proj.fc1.weight"},
{"pmid.qformer_perceiver.token_proj.2.weight",
"pmid.qformer_perceiver.token_proj.fc2.weight"},
};

std::string convert_open_clip_to_hf_clip(const std::string& name) {
std::string new_name = name;
std::string prefix;
@@ -212,6 +239,13 @@ std::string convert_vae_decoder_name(const std::string& name) {
return name;
}

std::string convert_pmid_v2_name(const std::string& name) {
if (pmid_v2_name_map.find(name) != pmid_v2_name_map.end()) {
return pmid_v2_name_map[name];
}
return name;
}
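
For instance, per the map above (comment-style illustration):

```cpp
// convert_pmid_v2_name("pmid.qformer_perceiver.token_proj.0.weight")
//     -> "pmid.qformer_perceiver.token_proj.fc1.weight"
// Names not present in pmid_v2_name_map pass through unchanged.
```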

/* If not a SDXL LoRA the unet" prefix will have already been replaced by this
* point and "te2" and "te1" don't seem to appear in non-SDXL only "te_" */
std::string convert_sdxl_lora_name(std::string tensor_name) {
@@ -443,6 +477,8 @@ std::string convert_tensor_name(std::string name) {
new_name = convert_open_clip_to_hf_clip(name);
} else if (starts_with(name, "first_stage_model.decoder")) {
new_name = convert_vae_decoder_name(name);
} else if (starts_with(name, "pmid.qformer_perceiver")) {
new_name = convert_pmid_v2_name(name);
} else if (starts_with(name, "control_model.")) { // for controlnet pth models
size_t pos = name.find('.');
if (pos != std::string::npos) {
@@ -1015,7 +1051,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
}

TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
tensor_storage.reverse_ne();

size_t tensor_data_size = end - begin;

@@ -1362,7 +1398,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer,
reader.tensor_storage.reverse_ne();
reader.tensor_storage.file_index = file_index;
// if(strcmp(prefix.c_str(), "scarlett") == 0)
// printf(" got tensor %s \n ", reader.tensor_storage.name.c_str());
// printf(" ZIP got tensor %s \n ", reader.tensor_storage.name.c_str());
reader.tensor_storage.name = prefix + reader.tensor_storage.name;
tensor_storages.push_back(reader.tensor_storage);
// LOG_DEBUG("%s", reader.tensor_storage.name.c_str());
@@ -1398,7 +1434,9 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
std::string name = zip_entry_name(zip);
size_t pos = name.find("data.pkl");
if (pos != std::string::npos) {

std::string dir = name.substr(0, pos);
printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str());
void* pkl_data = NULL;
size_t pkl_size;
zip_entry_read(zip, &pkl_data, &pkl_size);
6 changes: 6 additions & 0 deletions model.h
@@ -31,6 +31,11 @@ enum SDVersion {
VERSION_COUNT,
};

enum PMVersion {
VERSION_1,
VERSION_2,
};

struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
@@ -162,6 +167,7 @@ class ModelLoader {
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});

bool save_to_gguf_file(const std::string& file_path, ggml_type type);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);