342 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			342 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import torch
 | 
						|
from torch import nn, einsum
 | 
						|
from .ldm.modules.attention import CrossAttention
 | 
						|
from inspect import isfunction
 | 
						|
 | 
						|
 | 
						|
def exists(val):
 | 
						|
    return val is not None
 | 
						|
 | 
						|
 | 
						|
def uniq(arr):
 | 
						|
    return{el: True for el in arr}.keys()
 | 
						|
 | 
						|
 | 
						|
def default(val, d):
 | 
						|
    if exists(val):
 | 
						|
        return val
 | 
						|
    return d() if isfunction(d) else d
 | 
						|
 | 
						|
 | 
						|
# feedforward
 | 
						|
class GEGLU(nn.Module):
 | 
						|
    def __init__(self, dim_in, dim_out):
 | 
						|
        super().__init__()
 | 
						|
        self.proj = nn.Linear(dim_in, dim_out * 2)
 | 
						|
 | 
						|
    def forward(self, x):
 | 
						|
        x, gate = self.proj(x).chunk(2, dim=-1)
 | 
						|
        return x * torch.nn.functional.gelu(gate)
 | 
						|
 | 
						|
 | 
						|
class FeedForward(nn.Module):
 | 
						|
    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
 | 
						|
        super().__init__()
 | 
						|
        inner_dim = int(dim * mult)
 | 
						|
        dim_out = default(dim_out, dim)
 | 
						|
        project_in = nn.Sequential(
 | 
						|
            nn.Linear(dim, inner_dim),
 | 
						|
            nn.GELU()
 | 
						|
        ) if not glu else GEGLU(dim, inner_dim)
 | 
						|
 | 
						|
        self.net = nn.Sequential(
 | 
						|
            project_in,
 | 
						|
            nn.Dropout(dropout),
 | 
						|
            nn.Linear(inner_dim, dim_out)
 | 
						|
        )
 | 
						|
 | 
						|
    def forward(self, x):
 | 
						|
        return self.net(x)
 | 
						|
 | 
						|
 | 
						|
class GatedCrossAttentionDense(nn.Module):
 | 
						|
    def __init__(self, query_dim, context_dim, n_heads, d_head):
 | 
						|
        super().__init__()
 | 
						|
 | 
						|
        self.attn = CrossAttention(
 | 
						|
            query_dim=query_dim,
 | 
						|
            context_dim=context_dim,
 | 
						|
            heads=n_heads,
 | 
						|
            dim_head=d_head)
 | 
						|
        self.ff = FeedForward(query_dim, glu=True)
 | 
						|
 | 
						|
        self.norm1 = nn.LayerNorm(query_dim)
 | 
						|
        self.norm2 = nn.LayerNorm(query_dim)
 | 
						|
 | 
						|
        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
 | 
						|
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
 | 
						|
 | 
						|
        # this can be useful: we can externally change magnitude of tanh(alpha)
 | 
						|
        # for example, when it is set to 0, then the entire model is same as
 | 
						|
        # original one
 | 
						|
        self.scale = 1
 | 
						|
 | 
						|
    def forward(self, x, objs):
 | 
						|
 | 
						|
        x = x + self.scale * \
 | 
						|
            torch.tanh(self.alpha_attn) * self.attn(self.norm1(x), objs, objs)
 | 
						|
        x = x + self.scale * \
 | 
						|
            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
 | 
						|
 | 
						|
        return x
 | 
						|
 | 
						|
 | 
						|
class GatedSelfAttentionDense(nn.Module):
 | 
						|
    def __init__(self, query_dim, context_dim, n_heads, d_head):
 | 
						|
        super().__init__()
 | 
						|
 | 
						|
        # we need a linear projection since we need cat visual feature and obj
 | 
						|
        # feature
 | 
						|
        self.linear = nn.Linear(context_dim, query_dim)
 | 
						|
 | 
						|
        self.attn = CrossAttention(
 | 
						|
            query_dim=query_dim,
 | 
						|
            context_dim=query_dim,
 | 
						|
            heads=n_heads,
 | 
						|
            dim_head=d_head)
 | 
						|
        self.ff = FeedForward(query_dim, glu=True)
 | 
						|
 | 
						|
        self.norm1 = nn.LayerNorm(query_dim)
 | 
						|
        self.norm2 = nn.LayerNorm(query_dim)
 | 
						|
 | 
						|
        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
 | 
						|
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
 | 
						|
 | 
						|
        # this can be useful: we can externally change magnitude of tanh(alpha)
 | 
						|
        # for example, when it is set to 0, then the entire model is same as
 | 
						|
        # original one
 | 
						|
        self.scale = 1
 | 
						|
 | 
						|
    def forward(self, x, objs):
 | 
						|
 | 
						|
        N_visual = x.shape[1]
 | 
						|
        objs = self.linear(objs)
 | 
						|
 | 
						|
        x = x + self.scale * torch.tanh(self.alpha_attn) * self.attn(
 | 
						|
            self.norm1(torch.cat([x, objs], dim=1)))[:, 0:N_visual, :]
 | 
						|
        x = x + self.scale * \
 | 
						|
            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
 | 
						|
 | 
						|
        return x
 | 
						|
 | 
						|
 | 
						|
class GatedSelfAttentionDense2(nn.Module):
 | 
						|
    def __init__(self, query_dim, context_dim, n_heads, d_head):
 | 
						|
        super().__init__()
 | 
						|
 | 
						|
        # we need a linear projection since we need cat visual feature and obj
 | 
						|
        # feature
 | 
						|
        self.linear = nn.Linear(context_dim, query_dim)
 | 
						|
 | 
						|
        self.attn = CrossAttention(
 | 
						|
            query_dim=query_dim, context_dim=query_dim, dim_head=d_head)
 | 
						|
        self.ff = FeedForward(query_dim, glu=True)
 | 
						|
 | 
						|
        self.norm1 = nn.LayerNorm(query_dim)
 | 
						|
        self.norm2 = nn.LayerNorm(query_dim)
 | 
						|
 | 
						|
        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
 | 
						|
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
 | 
						|
 | 
						|
        # this can be useful: we can externally change magnitude of tanh(alpha)
 | 
						|
        # for example, when it is set to 0, then the entire model is same as
 | 
						|
        # original one
 | 
						|
        self.scale = 1
 | 
						|
 | 
						|
    def forward(self, x, objs):
 | 
						|
 | 
						|
        B, N_visual, _ = x.shape
 | 
						|
        B, N_ground, _ = objs.shape
 | 
						|
 | 
						|
        objs = self.linear(objs)
 | 
						|
 | 
						|
        # sanity check
 | 
						|
        size_v = math.sqrt(N_visual)
 | 
						|
        size_g = math.sqrt(N_ground)
 | 
						|
        assert int(size_v) == size_v, "Visual tokens must be square rootable"
 | 
						|
        assert int(size_g) == size_g, "Grounding tokens must be square rootable"
 | 
						|
        size_v = int(size_v)
 | 
						|
        size_g = int(size_g)
 | 
						|
 | 
						|
        # select grounding token and resize it to visual token size as residual
 | 
						|
        out = self.attn(self.norm1(torch.cat([x, objs], dim=1)))[
 | 
						|
            :, N_visual:, :]
 | 
						|
        out = out.permute(0, 2, 1).reshape(B, -1, size_g, size_g)
 | 
						|
        out = torch.nn.functional.interpolate(
 | 
						|
            out, (size_v, size_v), mode='bicubic')
 | 
						|
        residual = out.reshape(B, -1, N_visual).permute(0, 2, 1)
 | 
						|
 | 
						|
        # add residual to visual feature
 | 
						|
        x = x + self.scale * torch.tanh(self.alpha_attn) * residual
 | 
						|
        x = x + self.scale * \
 | 
						|
            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
 | 
						|
 | 
						|
        return x
 | 
						|
 | 
						|
 | 
						|
class FourierEmbedder():
 | 
						|
    def __init__(self, num_freqs=64, temperature=100):
 | 
						|
 | 
						|
        self.num_freqs = num_freqs
 | 
						|
        self.temperature = temperature
 | 
						|
        self.freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs)
 | 
						|
 | 
						|
    @torch.no_grad()
 | 
						|
    def __call__(self, x, cat_dim=-1):
 | 
						|
        "x: arbitrary shape of tensor. dim: cat dim"
 | 
						|
        out = []
 | 
						|
        for freq in self.freq_bands:
 | 
						|
            out.append(torch.sin(freq * x))
 | 
						|
            out.append(torch.cos(freq * x))
 | 
						|
        return torch.cat(out, cat_dim)
 | 
						|
 | 
						|
 | 
						|
class PositionNet(nn.Module):
 | 
						|
    def __init__(self, in_dim, out_dim, fourier_freqs=8):
 | 
						|
        super().__init__()
 | 
						|
        self.in_dim = in_dim
 | 
						|
        self.out_dim = out_dim
 | 
						|
 | 
						|
        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
 | 
						|
        self.position_dim = fourier_freqs * 2 * 4  # 2 is sin&cos, 4 is xyxy
 | 
						|
 | 
						|
        self.linears = nn.Sequential(
 | 
						|
            nn.Linear(self.in_dim + self.position_dim, 512),
 | 
						|
            nn.SiLU(),
 | 
						|
            nn.Linear(512, 512),
 | 
						|
            nn.SiLU(),
 | 
						|
            nn.Linear(512, out_dim),
 | 
						|
        )
 | 
						|
 | 
						|
        self.null_positive_feature = torch.nn.Parameter(
 | 
						|
            torch.zeros([self.in_dim]))
 | 
						|
        self.null_position_feature = torch.nn.Parameter(
 | 
						|
            torch.zeros([self.position_dim]))
 | 
						|
 | 
						|
    def forward(self, boxes, masks, positive_embeddings):
 | 
						|
        B, N, _ = boxes.shape
 | 
						|
        dtype = self.linears[0].weight.dtype
 | 
						|
        masks = masks.unsqueeze(-1).to(dtype)
 | 
						|
        positive_embeddings = positive_embeddings.to(dtype)
 | 
						|
 | 
						|
        # embedding position (it may includes padding as placeholder)
 | 
						|
        xyxy_embedding = self.fourier_embedder(boxes.to(dtype))  # B*N*4 --> B*N*C
 | 
						|
 | 
						|
        # learnable null embedding
 | 
						|
        positive_null = self.null_positive_feature.view(1, 1, -1)
 | 
						|
        xyxy_null = self.null_position_feature.view(1, 1, -1)
 | 
						|
 | 
						|
        # replace padding with learnable null embedding
 | 
						|
        positive_embeddings = positive_embeddings * \
 | 
						|
            masks + (1 - masks) * positive_null
 | 
						|
        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
 | 
						|
 | 
						|
        objs = self.linears(
 | 
						|
            torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
 | 
						|
        assert objs.shape == torch.Size([B, N, self.out_dim])
 | 
						|
        return objs
 | 
						|
 | 
						|
 | 
						|
class Gligen(nn.Module):
 | 
						|
    def __init__(self, modules, position_net, key_dim):
 | 
						|
        super().__init__()
 | 
						|
        self.module_list = nn.ModuleList(modules)
 | 
						|
        self.position_net = position_net
 | 
						|
        self.key_dim = key_dim
 | 
						|
        self.max_objs = 30
 | 
						|
        self.current_device = torch.device("cpu")
 | 
						|
 | 
						|
    def _set_position(self, boxes, masks, positive_embeddings):
 | 
						|
        objs = self.position_net(boxes, masks, positive_embeddings)
 | 
						|
        def func(x, extra_options):
 | 
						|
            key = extra_options["transformer_index"]
 | 
						|
            module = self.module_list[key]
 | 
						|
            return module(x, objs)
 | 
						|
        return func
 | 
						|
 | 
						|
    def set_position(self, latent_image_shape, position_params, device):
 | 
						|
        batch, c, h, w = latent_image_shape
 | 
						|
        masks = torch.zeros([self.max_objs], device="cpu")
 | 
						|
        boxes = []
 | 
						|
        positive_embeddings = []
 | 
						|
        for p in position_params:
 | 
						|
            x1 = (p[4]) / w
 | 
						|
            y1 = (p[3]) / h
 | 
						|
            x2 = (p[4] + p[2]) / w
 | 
						|
            y2 = (p[3] + p[1]) / h
 | 
						|
            masks[len(boxes)] = 1.0
 | 
						|
            boxes += [torch.tensor((x1, y1, x2, y2)).unsqueeze(0)]
 | 
						|
            positive_embeddings += [p[0]]
 | 
						|
        append_boxes = []
 | 
						|
        append_conds = []
 | 
						|
        if len(boxes) < self.max_objs:
 | 
						|
            append_boxes = [torch.zeros(
 | 
						|
                [self.max_objs - len(boxes), 4], device="cpu")]
 | 
						|
            append_conds = [torch.zeros(
 | 
						|
                [self.max_objs - len(boxes), self.key_dim], device="cpu")]
 | 
						|
 | 
						|
        box_out = torch.cat(
 | 
						|
            boxes + append_boxes).unsqueeze(0).repeat(batch, 1, 1)
 | 
						|
        masks = masks.unsqueeze(0).repeat(batch, 1)
 | 
						|
        conds = torch.cat(positive_embeddings +
 | 
						|
                          append_conds).unsqueeze(0).repeat(batch, 1, 1)
 | 
						|
        return self._set_position(
 | 
						|
            box_out.to(device),
 | 
						|
            masks.to(device),
 | 
						|
            conds.to(device))
 | 
						|
 | 
						|
    def set_empty(self, latent_image_shape, device):
 | 
						|
        batch, c, h, w = latent_image_shape
 | 
						|
        masks = torch.zeros([self.max_objs], device="cpu").repeat(batch, 1)
 | 
						|
        box_out = torch.zeros([self.max_objs, 4],
 | 
						|
                              device="cpu").repeat(batch, 1, 1)
 | 
						|
        conds = torch.zeros([self.max_objs, self.key_dim],
 | 
						|
                            device="cpu").repeat(batch, 1, 1)
 | 
						|
        return self._set_position(
 | 
						|
            box_out.to(device),
 | 
						|
            masks.to(device),
 | 
						|
            conds.to(device))
 | 
						|
 | 
						|
 | 
						|
def load_gligen(sd):
 | 
						|
    sd_k = sd.keys()
 | 
						|
    output_list = []
 | 
						|
    key_dim = 768
 | 
						|
    for a in ["input_blocks", "middle_block", "output_blocks"]:
 | 
						|
        for b in range(20):
 | 
						|
            k_temp = filter(lambda k: "{}.{}.".format(a, b)
 | 
						|
                            in k and ".fuser." in k, sd_k)
 | 
						|
            k_temp = map(lambda k: (k, k.split(".fuser.")[-1]), k_temp)
 | 
						|
 | 
						|
            n_sd = {}
 | 
						|
            for k in k_temp:
 | 
						|
                n_sd[k[1]] = sd[k[0]]
 | 
						|
            if len(n_sd) > 0:
 | 
						|
                query_dim = n_sd["linear.weight"].shape[0]
 | 
						|
                key_dim = n_sd["linear.weight"].shape[1]
 | 
						|
 | 
						|
                if key_dim == 768:  # SD1.x
 | 
						|
                    n_heads = 8
 | 
						|
                    d_head = query_dim // n_heads
 | 
						|
                else:
 | 
						|
                    d_head = 64
 | 
						|
                    n_heads = query_dim // d_head
 | 
						|
 | 
						|
                gated = GatedSelfAttentionDense(
 | 
						|
                    query_dim, key_dim, n_heads, d_head)
 | 
						|
                gated.load_state_dict(n_sd, strict=False)
 | 
						|
                output_list.append(gated)
 | 
						|
 | 
						|
    if "position_net.null_positive_feature" in sd_k:
 | 
						|
        in_dim = sd["position_net.null_positive_feature"].shape[0]
 | 
						|
        out_dim = sd["position_net.linears.4.weight"].shape[0]
 | 
						|
 | 
						|
        class WeightsLoader(torch.nn.Module):
 | 
						|
            pass
 | 
						|
        w = WeightsLoader()
 | 
						|
        w.position_net = PositionNet(in_dim, out_dim)
 | 
						|
        w.load_state_dict(sd, strict=False)
 | 
						|
 | 
						|
    gligen = Gligen(output_list, w.position_net, key_dim)
 | 
						|
    return gligen
 |