diff --git a/backend/headless/fcbh/clip_vision.py b/backend/headless/fcbh/clip_vision.py
index b93b0da..f3c4bb6 100644
--- a/backend/headless/fcbh/clip_vision.py
+++ b/backend/headless/fcbh/clip_vision.py
@@ -1,5 +1,5 @@
-from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor, modeling_utils
-from .utils import load_torch_file, transformers_convert
+from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, modeling_utils
+from .utils import load_torch_file, transformers_convert, common_upscale
 import os
 import torch
 import contextlib
@@ -7,6 +7,18 @@ import contextlib
 import fcbh.ops
 import fcbh.model_patcher
 import fcbh.model_management
+import fcbh.utils
+
+def clip_preprocess(image, size=224):
+    mean = torch.tensor([ 0.48145466,0.4578275,0.40821073], device=image.device, dtype=image.dtype)
+    std = torch.tensor([0.26862954,0.26130258,0.27577711], device=image.device, dtype=image.dtype)
+    scale = (size / min(image.shape[1], image.shape[2]))
+    image = torch.nn.functional.interpolate(image.movedim(-1, 1), size=(round(scale * image.shape[1]), round(scale * image.shape[2])), mode="bicubic", antialias=True)
+    h = (image.shape[2] - size)//2
+    w = (image.shape[3] - size)//2
+    image = image[:,:,h:h+size,w:w+size]
+    image = torch.clip((255. * image), 0, 255).round() / 255.0
+    return (image - mean.view([3,1,1])) / std.view([3,1,1])

 class ClipVisionModel():
     def __init__(self, json_config):
@@ -23,25 +35,12 @@ class ClipVisionModel():
         self.model.to(self.dtype)

         self.patcher = fcbh.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
-        self.processor = CLIPImageProcessor(crop_size=224,
-                                            do_center_crop=True,
-                                            do_convert_rgb=True,
-                                            do_normalize=True,
-                                            do_resize=True,
-                                            image_mean=[ 0.48145466,0.4578275,0.40821073],
-                                            image_std=[0.26862954,0.26130258,0.27577711],
-                                            resample=3, #bicubic
-                                            size=224)
-
     def load_sd(self, sd):
         return self.model.load_state_dict(sd, strict=False)

     def encode_image(self, image):
-        img = torch.clip((255. * image), 0, 255).round().int()
-        img = list(map(lambda a: a, img))
-        inputs = self.processor(images=img, return_tensors="pt")
         fcbh.model_management.load_model_gpu(self.patcher)
-        pixel_values = inputs['pixel_values'].to(self.load_device)
+        pixel_values = clip_preprocess(image.to(self.load_device))

         if self.dtype != torch.float32:
             precision_scope = torch.autocast
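Note on the clip_vision.py change: the new `clip_preprocess` replaces the PIL/numpy round trip through `CLIPImageProcessor` with a batched, on-device tensor path. It resizes the shorter side to 224 with antialiased bicubic interpolation, center-crops, snaps values to 1/255 steps to mirror the uint8 quantization the removed path performed, then applies the CLIP mean/std. A minimal shape check (hypothetical input size; assumes `backend/headless` is on `sys.path` so the `fcbh` imports resolve):

```python
import torch
from fcbh.clip_vision import clip_preprocess

# Fooocus passes images as BHWC float tensors in [0, 1]
batch = torch.rand(2, 480, 640, 3)   # hypothetical 480x640 batch
out = clip_preprocess(batch)         # shorter side -> 224, center crop, normalize
print(out.shape)                     # torch.Size([2, 3, 224, 224])
```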
diff --git a/backend/headless/fcbh/ldm/modules/attention.py b/backend/headless/fcbh/ldm/modules/attention.py
index a0af385..f3e1b6e 100644
--- a/backend/headless/fcbh/ldm/modules/attention.py
+++ b/backend/headless/fcbh/ldm/modules/attention.py
@@ -222,9 +222,14 @@ def attention_split(q, k, v, heads, mask=None):

     mem_free_total = model_management.get_free_memory(q.device)

+    if _ATTN_PRECISION =="fp32":
+        element_size = 4
+    else:
+        element_size = q.element_size()
+
     gb = 1024 ** 3
-    tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size()
-    modifier = 3 if q.element_size() == 2 else 2.5
+    tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * element_size
+    modifier = 3 if element_size == 2 else 2.5
     mem_required = tensor_size * modifier
     steps = 1
diff --git a/backend/headless/fcbh/ldm/modules/sub_quadratic_attention.py b/backend/headless/fcbh/ldm/modules/sub_quadratic_attention.py
index 1f07431..11d1dd4 100644
--- a/backend/headless/fcbh/ldm/modules/sub_quadratic_attention.py
+++ b/backend/headless/fcbh/ldm/modules/sub_quadratic_attention.py
@@ -83,7 +83,8 @@ def _summarize_chunk(
     )
     max_score, _ = torch.max(attn_weights, -1, keepdim=True)
     max_score = max_score.detach()
-    torch.exp(attn_weights - max_score, out=attn_weights)
+    attn_weights -= max_score
+    torch.exp(attn_weights, out=attn_weights)
     exp_weights = attn_weights.to(value.dtype)
     exp_values = torch.bmm(exp_weights, value)
     max_score = max_score.squeeze(-1)
diff --git a/fooocus_extras/ip_adapter.py b/fooocus_extras/ip_adapter.py
index 0d2ca01..aeb7de2 100644
--- a/fooocus_extras/ip_adapter.py
+++ b/fooocus_extras/ip_adapter.py
@@ -7,6 +7,7 @@ import fcbh.ldm.modules.attention as attention

 from fooocus_extras.resampler import Resampler
 from fcbh.model_patcher import ModelPatcher
+from modules.core import numpy_to_pytorch

 SD_V12_CHANNELS = [320] * 4 + [640] * 4 + [1280] * 4 + [1280] * 6 + [640] * 6 + [320] * 6 + [1280] * 2
@@ -144,14 +145,27 @@ def load_ip_adapter(clip_vision_path, ip_negative_path, ip_adapter_path):
     return


+@torch.no_grad()
+@torch.inference_mode()
+def clip_preprocess(image):
+    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=image.device, dtype=image.dtype).view([1, 3, 1, 1])
+    std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=image.device, dtype=image.dtype).view([1, 3, 1, 1])
+    image = image.movedim(-1, 1)
+
+    # https://github.com/tencent-ailab/IP-Adapter/blob/d580c50a291566bbf9fc7ac0f760506607297e6d/README.md?plain=1#L75
+    B, C, H, W = image.shape
+    assert H == 224 and W == 224
+
+    return (image - mean) / std
+
+
 @torch.no_grad()
 @torch.inference_mode()
 def preprocess(img):
     global ip_unconds

-    inputs = clip_vision.processor(images=img, return_tensors="pt")
     fcbh.model_management.load_model_gpu(clip_vision.patcher)
-    pixel_values = inputs['pixel_values'].to(clip_vision.load_device)
+    pixel_values = clip_preprocess(numpy_to_pytorch(img).to(clip_vision.load_device))

     if clip_vision.dtype != torch.float32:
         precision_scope = torch.autocast
diff --git a/fooocus_version.py b/fooocus_version.py
index 7436cd3..64de171 100644
--- a/fooocus_version.py
+++ b/fooocus_version.py
@@ -1 +1 @@
-version = '2.1.744'
+version = '2.1.745'
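Note on the attention changes: when `_ATTN_PRECISION == "fp32"`, `attention_split` materializes the score tensor in fp32 even if q, k and v are fp16, so sizing the chunks from `q.element_size()` undercounted the real allocation and could let a step exceed the free-memory budget. The `sub_quadratic_attention` edit computes the same stabilized softmax as before, but subtracting the max in place first avoids allocating a chunk-sized temporary for `attn_weights - max_score`. Back-of-envelope numbers for the estimate (hypothetical SDXL-ish shapes, not from the patch):

```python
# (batch * heads, query tokens, key tokens), made-up but plausible sizes
b_heads, q_tokens, k_tokens = 16, 4096, 4096

old_estimate = b_heads * q_tokens * k_tokens * 2 * 3    # fp16 element size, modifier 3
new_estimate = b_heads * q_tokens * k_tokens * 4 * 2.5  # fp32 upcast accounted for
print(old_estimate / 1024**3, new_estimate / 1024**3)   # 1.5 vs 2.5 (GB)
```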
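The Fooocus-side `clip_preprocess` in ip_adapter.py only normalizes: unlike the fcbh version it asserts the input is already 224x224, the resolution the linked IP-Adapter reference uses, leaving the resize to the caller. A sketch of the new path, assuming `numpy_to_pytorch` maps a uint8 HWC image to a float BHWC batch in [0, 1] (how `modules.core` is used elsewhere in Fooocus):

```python
import numpy as np
from modules.core import numpy_to_pytorch
from fooocus_extras.ip_adapter import clip_preprocess

img = np.zeros((224, 224, 3), dtype=np.uint8)    # caller is expected to resize to 224x224
pixel_values = clip_preprocess(numpy_to_pytorch(img))
print(pixel_values.shape)                        # torch.Size([1, 3, 224, 224]), CLIP-normalized
```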
diff --git a/modules/async_worker.py b/modules/async_worker.py
index 9ef54aa..26cd986 100644
--- a/modules/async_worker.py
+++ b/modules/async_worker.py
@@ -210,8 +210,8 @@ def worker():

         if not skip_prompt_processing:

-            prompts = remove_empty_str([safe_str(p) for p in prompt.split('\n')], default='')
-            negative_prompts = remove_empty_str([safe_str(p) for p in negative_prompt.split('\n')], default='')
+            prompts = remove_empty_str([safe_str(p) for p in prompt.splitlines()], default='')
+            negative_prompts = remove_empty_str([safe_str(p) for p in negative_prompt.splitlines()], default='')

             prompt = prompts[0]
             negative_prompt = negative_prompts[0]
@@ -239,8 +239,8 @@ def worker():
                 if use_style:
                     for s in style_selections:
                         p, n = apply_style(s, positive=task_prompt)
-                        positive_basic_workloads.append(p)
-                        negative_basic_workloads.append(n)
+                        positive_basic_workloads += p
+                        negative_basic_workloads += n
                 else:
                     positive_basic_workloads.append(task_prompt)
diff --git a/modules/sdxl_styles.py b/modules/sdxl_styles.py
index 6979aa9..14a4ff1 100644
--- a/modules/sdxl_styles.py
+++ b/modules/sdxl_styles.py
@@ -40,7 +40,9 @@ for styles_file in styles_files:
     try:
         with open(os.path.join(styles_path, styles_file), encoding='utf-8') as f:
             for entry in json.load(f):
-                name, prompt, negative_prompt = normalize_key(entry['name']), entry['prompt'], entry['negative_prompt']
+                name = normalize_key(entry['name'])
+                prompt = entry['prompt'] if 'prompt' in entry else ''
+                negative_prompt = entry['negative_prompt'] if 'negative_prompt' in entry else ''
                 styles[name] = (prompt, negative_prompt)
     except Exception as e:
         print(str(e))
@@ -53,7 +55,7 @@ legal_style_names = [fooocus_expansion] + style_keys

 def apply_style(style, positive):
     p, n = styles[style]
-    return p.replace('{prompt}', positive), n
+    return p.replace('{prompt}', positive).splitlines(), n.splitlines()


 def apply_wildcards(wildcard_text, rng, directory=wildcards_path):
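Note on the style plumbing: `apply_style` now returns lists, one element per line of the style template, and the worker extends the workload lists with `+=` instead of `append`, so a multi-line style contributes one workload entry per line rather than a single string with embedded newlines. `splitlines()` also covers `\r\n` and other terminators that `split('\n')` missed. An illustration with a hypothetical style entry:

```python
from modules.sdxl_styles import styles, apply_style

styles['My Style'] = ('first line, {prompt}\nsecond line', 'bad quality')  # hypothetical

p, n = apply_style('My Style', positive='a cat')
# p == ['first line, a cat', 'second line'], n == ['bad quality']

positive_basic_workloads = []
positive_basic_workloads += p   # two separate entries, not a nested list
```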
diff --git a/sdxl_styles/sdxl_styles_fooocus.json b/sdxl_styles/sdxl_styles_fooocus.json
index b6e816d..68c1db2 100644
--- a/sdxl_styles/sdxl_styles_fooocus.json
+++ b/sdxl_styles/sdxl_styles_fooocus.json
@@ -1,13 +1,12 @@
 [
     {
         "name": "Fooocus Enhance",
-        "prompt": "{prompt} . (perfect real extremely details), award-winning, breathtaking, amazing fine detail, dramatic lighting, best quality",
         "negative_prompt": "(worst quality, low quality, normal quality, lowres, low details, oversaturated, undersaturated, overexposed, underexposed, grayscale, bw, bad photo, bad photography, bad art:1.4), (watermark, signature, text font, username, error, logo, words, letters, digits, autograph, trademark, name:1.2), (blur, blurry, grainy), morbid, ugly, asymmetrical, mutated malformed, mutilated, poorly lit, bad shadow, draft, cropped, out of frame, cut off, censored, jpeg artifacts, out of focus, glitch, duplicate, (airbrushed, cartoon, anime, semi-realistic, cgi, render, blender, digital art, manga, amateur:1.3), (3D ,3D Game, 3D Game Scene, 3D Character:1.1), (bad hands, bad anatomy, bad body, bad face, bad teeth, bad arms, bad legs, deformities:1.3)"
     },
     {
         "name": "Fooocus Sharp",
-        "prompt": "cinematic still {prompt} . sharp focus, emotional, harmonious, vignette, 4k epic detailed photograph shot on kodak detailed cinematic hbo dark moody, 35mm photo, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy",
-        "negative_prompt": "blurry, anime, cartoon, blured background, graphic, bokeh, background blur, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured"
+        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, cinemascope, moody, epic, gorgeous, film grain, grainy",
+        "negative_prompt": "anime, cartoon, graphic, (blur, blurry, bokeh), text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured"
     },
     {
         "name": "Fooocus Masterpiece",
@@ -21,7 +20,6 @@
     },
     {
         "name": "Fooocus Negative",
-        "prompt": "",
         "negative_prompt": "deformed, bad anatomy, disfigured, poorly drawn face, mutated, extra limb, ugly, poorly drawn hands, missing limb, floating limbs, disconnected limbs, disconnected head, malformed hands, long neck, mutated hands and fingers, bad hands, missing fingers, cropped, worst quality, low quality, mutation, poorly drawn, huge calf, bad hands, fused hand, missing hand, disappearing arms, disappearing thigh, disappearing calf, disappearing legs, missing fingers, fused fingers, abnormal eye proportion, Abnormal hands, abnormal legs, abnormal feet, abnormal fingers, drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly, anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch"
     },
     {
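Dropping the "prompt" keys from "Fooocus Enhance" and "Fooocus Negative" relies on the loader fallback added above in modules/sdxl_styles.py. One consequence worth noting: `''.splitlines()` is `[]`, so a prompt-less style now contributes nothing to the positive workloads instead of appending an empty string. A quick check of that path (hypothetical entry):

```python
entry = {"name": "Fooocus Negative", "negative_prompt": "deformed, bad anatomy"}

prompt = entry['prompt'] if 'prompt' in entry else ''     # missing key -> ''
print(prompt.replace('{prompt}', 'a cat').splitlines())   # [] -> no positive entries added
```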