diff --git a/extras/ip_adapter.py b/extras/ip_adapter.py
index ac64ef1..cb1d366 100644
--- a/extras/ip_adapter.py
+++ b/extras/ip_adapter.py
@@ -167,14 +167,7 @@ def preprocess(img, ip_adapter_path):
 
     ldm_patched.modules.model_management.load_model_gpu(clip_vision.patcher)
     pixel_values = clip_preprocess(numpy_to_pytorch(img).to(clip_vision.load_device))
-
-    if clip_vision.dtype != torch.float32:
-        precision_scope = torch.autocast
-    else:
-        precision_scope = lambda a, b: contextlib.nullcontext(a)
-
-    with precision_scope(ldm_patched.modules.model_management.get_autocast_device(clip_vision.load_device), torch.float32):
-        outputs = clip_vision.model(pixel_values=pixel_values, output_hidden_states=True)
+    outputs = clip_vision.model(pixel_values=pixel_values, output_hidden_states=True)
 
     ip_adapter = entry['ip_adapter']
     ip_layers = entry['ip_layers']
diff --git a/fooocus_version.py b/fooocus_version.py
index 70a5e92..b588f46 100644
--- a/fooocus_version.py
+++ b/fooocus_version.py
@@ -1 +1 @@
-version = '2.1.850'
+version = '2.1.851'
diff --git a/ldm_patched/contrib/external_images.py b/ldm_patched/contrib/external_images.py
index 3dbb3e3..17e9c49 100644
--- a/ldm_patched/contrib/external_images.py
+++ b/ldm_patched/contrib/external_images.py
@@ -76,7 +76,7 @@ class SaveAnimatedWEBP:
 
     OUTPUT_NODE = True
 
-    CATEGORY = "_for_testing"
+    CATEGORY = "image/animation"
 
     def save_images(self, images, fps, filename_prefix, lossless, quality, method, num_frames=0, prompt=None, extra_pnginfo=None):
         method = self.methods.get(method)
@@ -138,7 +138,7 @@ class SaveAnimatedPNG:
 
    OUTPUT_NODE = True
 
-    CATEGORY = "_for_testing"
+    CATEGORY = "image/animation"
 
     def save_images(self, images, fps, compress_level, filename_prefix="ldm_patched", prompt=None, extra_pnginfo=None):
         filename_prefix += self.prefix_append
diff --git a/ldm_patched/modules/args_parser.py b/ldm_patched/modules/args_parser.py
index 3931997..7957783 100644
--- a/ldm_patched/modules/args_parser.py
+++ b/ldm_patched/modules/args_parser.py
@@ -102,7 +102,7 @@
 vram_group.add_argument("--always-cpu", action="store_true")
 
 parser.add_argument("--always-offload-from-vram", action="store_true")
-
+parser.add_argument("--pytorch-deterministic", action="store_true")
 
 parser.add_argument("--disable-server-log", action="store_true")
 parser.add_argument("--debug-mode", action="store_true")
diff --git a/ldm_patched/modules/model_management.py b/ldm_patched/modules/model_management.py
index 0e783b3..31cf95d 100644
--- a/ldm_patched/modules/model_management.py
+++ b/ldm_patched/modules/model_management.py
@@ -28,6 +28,10 @@
 total_vram = 0
 lowvram_available = True
 xpu_available = False
+if args.pytorch_deterministic:
+    print("Using deterministic algorithms for pytorch")
+    torch.use_deterministic_algorithms(True, warn_only=True)
+
 directml_enabled = False
 if args.directml is not None:
     import torch_directml
diff --git a/modules/patch_clip.py b/modules/patch_clip.py
index 5a3e85d..74ee436 100644
--- a/modules/patch_clip.py
+++ b/modules/patch_clip.py
@@ -12,17 +12,34 @@
 import ldm_patched.modules.args_parser
 import ldm_patched.modules.model_base
 import ldm_patched.modules.model_management
 import ldm_patched.modules.model_patcher
-import ldm_patched.modules.ops
 import ldm_patched.modules.samplers
 import ldm_patched.modules.sd
 import ldm_patched.modules.sd1_clip
 import ldm_patched.modules.clip_vision
 import ldm_patched.modules.model_management as model_management
+import ldm_patched.modules.ops as ops
 
 import contextlib
 
 from transformers import CLIPTextModel, CLIPTextConfig, modeling_utils, CLIPVisionConfig, CLIPVisionModelWithProjection
 
 
+@contextlib.contextmanager
+def use_patched_ops(operations):
+    op_names = ['Linear', 'Conv2d', 'Conv3d', 'GroupNorm', 'LayerNorm']
+    backups = {op_name: getattr(torch.nn, op_name) for op_name in op_names}
+
+    try:
+        for op_name in op_names:
+            setattr(torch.nn, op_name, getattr(operations, op_name))
+
+        yield
+
+    finally:
+        for op_name in op_names:
+            setattr(torch.nn, op_name, backups[op_name])
+        return
+
+
 def patched_encode_token_weights(self, token_weight_pairs):
     to_encode = list()
     max_token_len = 0
@@ -79,15 +96,14 @@ def patched_SDClipModel__init__(self, max_length=77, freeze=True, layer="last",
     config = CLIPTextConfig.from_json_file(textmodel_json_config)
     self.num_layers = config.num_hidden_layers
 
-    with modeling_utils.no_init_weights():
-        self.transformer = CLIPTextModel(config)
-
-    if 'cuda' not in model_management.text_encoder_device().type:
-        dtype = torch.float32
+    with use_patched_ops(ops.manual_cast):
+        with modeling_utils.no_init_weights():
+            self.transformer = CLIPTextModel(config)
 
     if dtype is not None:
         self.transformer.to(dtype)
-        self.transformer.text_model.embeddings.to(torch.float32)
+
+    self.transformer.text_model.embeddings.to(torch.float32)
 
     if freeze:
         self.freeze()
@@ -114,42 +130,37 @@ def patched_SDClipModel_forward(self, tokens):
     tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
     tokens = torch.LongTensor(tokens).to(device)
 
-    if self.transformer.text_model.final_layer_norm.weight.dtype != torch.float32:
-        precision_scope = torch.autocast
+    attention_mask = None
+    if self.enable_attention_masks:
+        attention_mask = torch.zeros_like(tokens)
+        max_token = self.transformer.get_input_embeddings().weight.shape[0] - 1
+        for x in range(attention_mask.shape[0]):
+            for y in range(attention_mask.shape[1]):
+                attention_mask[x, y] = 1
+                if tokens[x, y] == max_token:
+                    break
+
+    outputs = self.transformer(input_ids=tokens, attention_mask=attention_mask,
+                               output_hidden_states=self.layer == "hidden")
+    self.transformer.set_input_embeddings(backup_embeds)
+
+    if self.layer == "last":
+        z = outputs.last_hidden_state
+    elif self.layer == "pooled":
+        z = outputs.pooler_output[:, None, :]
     else:
-        precision_scope = lambda a, dtype: contextlib.nullcontext(a)
+        z = outputs.hidden_states[self.layer_idx]
+        if self.layer_norm_hidden_state:
+            z = self.transformer.text_model.final_layer_norm(z)
-
-    with precision_scope(model_management.get_autocast_device(device), dtype=torch.float32):
-        attention_mask = None
-        if self.enable_attention_masks:
-            attention_mask = torch.zeros_like(tokens)
-            max_token = self.transformer.get_input_embeddings().weight.shape[0] - 1
-            for x in range(attention_mask.shape[0]):
-                for y in range(attention_mask.shape[1]):
-                    attention_mask[x, y] = 1
-                    if tokens[x, y] == max_token:
-                        break
+    if hasattr(outputs, "pooler_output"):
+        pooled_output = outputs.pooler_output.float()
+    else:
+        pooled_output = None
-        outputs = self.transformer(input_ids=tokens, attention_mask=attention_mask,
-                                   output_hidden_states=self.layer == "hidden")
-        self.transformer.set_input_embeddings(backup_embeds)
+    if self.text_projection is not None and pooled_output is not None:
+        pooled_output = pooled_output.float().to(self.text_projection.device) @ self.text_projection.float()
-        if self.layer == "last":
-            z = outputs.last_hidden_state
-        elif self.layer == "pooled":
-            z = outputs.pooler_output[:, None, :]
-        else:
-            z = outputs.hidden_states[self.layer_idx]
-            if self.layer_norm_hidden_state:
-                z = self.transformer.text_model.final_layer_norm(z)
-
-        if hasattr(outputs, "pooler_output"):
-            pooled_output = outputs.pooler_output.float()
-        else:
-            pooled_output = None
-
-        if self.text_projection is not None and pooled_output is not None:
-            pooled_output = pooled_output.float().to(self.text_projection.device) @ self.text_projection.float()
 
     return z.float(), pooled_output
@@ -164,11 +175,9 @@ def patched_ClipVisionModel__init__(self, json_config):
     else:
         self.dtype = torch.float32
 
-    if 'cuda' not in self.load_device.type:
-        self.dtype = torch.float32
-
-    with modeling_utils.no_init_weights():
-        self.model = CLIPVisionModelWithProjection(config)
+    with use_patched_ops(ops.manual_cast):
+        with modeling_utils.no_init_weights():
+            self.model = CLIPVisionModelWithProjection(config)
 
     self.model.to(self.dtype)
     self.patcher = ldm_patched.modules.model_patcher.ModelPatcher(
@@ -181,14 +190,7 @@ def patched_ClipVisionModel_encode_image(self, image):
 
     ldm_patched.modules.model_management.load_model_gpu(self.patcher)
     pixel_values = ldm_patched.modules.clip_vision.clip_preprocess(image.to(self.load_device))
-
-    if self.dtype != torch.float32:
-        precision_scope = torch.autocast
-    else:
-        precision_scope = lambda a, b: contextlib.nullcontext(a)
-
-    with precision_scope(ldm_patched.modules.model_management.get_autocast_device(self.load_device), torch.float32):
-        outputs = self.model(pixel_values=pixel_values, output_hidden_states=True)
+    outputs = self.model(pixel_values=pixel_values, output_hidden_states=True)
 
     for k in outputs:
         t = outputs[k]
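
Note on the new helper: use_patched_ops temporarily swaps the torch.nn layer classes for the ones in the given operations namespace while a model is built under modeling_utils.no_init_weights(), then restores the originals. This is presumably why the explicit torch.autocast blocks could be dropped here: layers constructed under use_patched_ops(ops.manual_cast) are expected to handle dtype casting themselves. A minimal, self-contained sketch of the swap-and-restore pattern follows; ToyOps is a stand-in namespace for illustration, not Fooocus code.

    import contextlib
    import torch


    @contextlib.contextmanager
    def use_patched_ops(operations):
        # Same pattern as the helper added in modules/patch_clip.py, trimmed to
        # one op name: swap the torch.nn class for the patched one, restore on exit.
        op_names = ['Linear']
        backups = {name: getattr(torch.nn, name) for name in op_names}
        try:
            for name in op_names:
                setattr(torch.nn, name, getattr(operations, name))
            yield
        finally:
            for name in op_names:
                setattr(torch.nn, name, backups[name])


    class ToyOps:
        # Stand-in for ldm_patched.modules.ops.manual_cast (illustration only).
        class Linear(torch.nn.Linear):
            pass


    with use_patched_ops(ToyOps):
        inside = torch.nn.Linear(4, 4)   # instantiates ToyOps.Linear

    outside = torch.nn.Linear(4, 4)      # original class is back after the block

    print(type(inside).__qualname__)     # ToyOps.Linear
    print(type(outside).__qualname__)    # Linear

In the patch itself, the same trick wraps the CLIPTextModel(config) and CLIPVisionModelWithProjection(config) constructors so the transformers models are built from the patched ops rather than the stock torch.nn layers.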