307 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			307 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #taken from: https://github.com/lllyasviel/ControlNet
 | |
| #and modified
 | |
| 
 | |
| import torch
 | |
| import torch as th
 | |
| import torch.nn as nn
 | |
| 
 | |
| from ..ldm.modules.diffusionmodules.util import (
 | |
|     zero_module,
 | |
|     timestep_embedding,
 | |
| )
 | |
| 
 | |
| from ..ldm.modules.attention import SpatialTransformer
 | |
| from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
 | |
| from ..ldm.util import exists
 | |
| import fcbh.ops
 | |
| 
 | |
class ControlledUnetModel(UNetModel):
    """Alias of ``UNetModel`` kept for ControlNet-style configurations.

    No behaviour is added here: the subclass body is empty because the
    control handling is implemented in the ldm UNet itself.
    """
 | |
| 
 | |
class ControlNet(nn.Module):
    """Encoder-only control branch (adapted from lllyasviel/ControlNet).

    The module owns a timestep embedding, an optional label embedding, a
    stack of input (down) blocks, a middle block and a small convolutional
    encoder for the conditioning ``hint``. Each input block and the middle
    block is paired with a zero-initialised 1x1 convolution; ``forward``
    returns the outputs of those zero convolutions as a list.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        dtype=torch.float32,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        adm_in_channels=None,
        transformer_depth_middle=None,
        device=None,
        operations=fcbh.ops,
    ):
        """Build the control branch.

        Args:
            image_size: Stored on the instance; not otherwise used here.
            in_channels: Channels of the input ``x`` fed to the first conv.
            model_channels: Base channel width; also the width of the
                timestep embedding input and of the encoded hint.
            hint_channels: Channels of the conditioning hint.
            num_res_blocks: Residual blocks per level, either one int for
                all levels or a sequence with ``len(channel_mult)`` entries.
            attention_resolutions: Downsample factors (1, 2, 4, ...) at which
                a ``SpatialTransformer`` is appended to each residual block.
            channel_mult: Per-level multiplier applied to ``model_channels``.
            num_classes: ``None``, an int (embedding table), ``"continuous"``
                (linear layer on one scalar) or ``"sequential"`` (MLP over
                ``adm_in_channels`` features).
            dtype / device: Forwarded to every layer this constructor builds
                directly through ``operations`` (time embedding, label MLP,
                input conv, hint encoder and zero convs).
            num_heads / num_head_channels: At least one must differ from -1.
            use_spatial_transformer: Must be truthy.
            context_dim: Cross-attention context width; must not be None.
            transformer_depth: Int or per-level sequence of depths.
            transformer_depth_middle: Depth for the middle transformer;
                defaults to the last entry of ``transformer_depth``.
            operations: Namespace providing ``Linear`` and ``conv_nd``.

        Remaining arguments are passed through to the sub-blocks or stored
        on the instance unchanged.

        Raises:
            AssertionError: If the transformer/head/attention options are
                inconsistent.
            ValueError: If ``num_res_blocks`` has the wrong length or
                ``num_classes`` is not a supported value.
        """
        super().__init__()
        assert use_spatial_transformer, "use_spatial_transformer has to be true"
        assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        if isinstance(transformer_depth, int):
            transformer_depth = len(channel_mult) * [transformer_depth]
        if transformer_depth_middle is None:
            transformer_depth_middle = transformer_depth[-1]
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            # Lengths are equal (asserted above), so zip covers every level.
            assert all(
                n_res >= n_attn
                for n_res, n_attn in zip(self.num_res_blocks, num_attention_blocks)
            )
            print(f"Constructor of ControlNet received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = dtype
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        # Shared factory kwargs so every directly constructed layer is
        # created with the same dtype and on the same device.
        factory = {"dtype": self.dtype, "device": device}

        def head_layout(channels):
            """Return ``(heads, dim_head)`` for an attention layer of width ``channels``."""
            if num_head_channels == -1:
                heads = num_heads
                head_dim = channels // heads
            else:
                heads = channels // num_head_channels
                head_dim = num_head_channels
            if legacy:
                head_dim = channels // heads if use_spatial_transformer else num_head_channels
            return heads, head_dim

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            operations.Linear(model_channels, time_embed_dim, **factory),
            nn.SiLU(),
            operations.Linear(time_embed_dim, time_embed_dim, **factory),
        )

        if self.num_classes is not None:
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
                self.label_emb = nn.Sequential(
                    nn.Sequential(
                        operations.Linear(adm_in_channels, time_embed_dim, **factory),
                        nn.SiLU(),
                        operations.Linear(time_embed_dim, time_embed_dim, **factory),
                    )
                )
            else:
                raise ValueError(
                    f"unsupported num_classes {self.num_classes!r}: expected None, an int, "
                    f"'continuous' or 'sequential'"
                )

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    operations.conv_nd(dims, in_channels, model_channels, 3, padding=1, **factory)
                )
            ]
        )
        self.zero_convs = nn.ModuleList(
            [self.make_zero_conv(model_channels, operations=operations, **factory)]
        )

        # Hint encoder: 3x3 convs with SiLU in between, three of them with
        # stride 2, ending in a zero-initialised conv to ``model_channels``.
        # NOTE(review): presumably the three stride-2 convs bring the hint to
        # the resolution of the first input block's output -- confirm with callers.
        self.input_hint_block = TimestepEmbedSequential(
            operations.conv_nd(dims, hint_channels, 16, 3, padding=1, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 16, 16, 3, padding=1, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 16, 32, 3, padding=1, stride=2, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 32, 32, 3, padding=1, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 32, 96, 3, padding=1, stride=2, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 96, 96, 3, padding=1, **factory),
            nn.SiLU(),
            operations.conv_nd(dims, 96, 256, 3, padding=1, stride=2, **factory),
            nn.SiLU(),
            zero_module(operations.conv_nd(dims, 256, model_channels, 3, padding=1, **factory)),
        )

        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # cumulative downsample factor of the current level
        last_level = len(channel_mult) - 1
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                        operations=operations,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    heads, dim_head = head_layout(ch)
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    # num_attention_blocks, when given, caps how many of this
                    # level's residual blocks get a transformer.
                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(
                            SpatialTransformer(
                                ch, heads, dim_head, depth=transformer_depth[level], context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint, operations=operations
                            )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, **factory))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != last_level:
                # Every level except the last ends with a downsampling block.
                out_ch = ch
                if resblock_updown:
                    downsampler = ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=out_ch,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                        down=True,
                        operations=operations,
                    )
                else:
                    downsampler = Downsample(
                        ch, conv_resample, dims=dims, out_channels=out_ch, operations=operations
                    )
                self.input_blocks.append(TimestepEmbedSequential(downsampler))
                ch = out_ch
                input_block_chans.append(ch)
                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, **factory))
                ds *= 2
                self._feature_size += ch

        heads, dim_head = head_layout(ch)
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
                operations=operations,
            ),
            SpatialTransformer(  # always uses a self-attn
                ch, heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint, operations=operations
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
                operations=operations,
            ),
        )
        self.middle_block_out = self.make_zero_conv(ch, operations=operations, **factory)
        self._feature_size += ch

    def make_zero_conv(self, channels, operations=None, dtype=None, device=None):
        """Return a zero-initialised 1x1 conv wrapped in ``TimestepEmbedSequential``.

        Args:
            channels: Input and output channel count.
            operations: Namespace providing ``conv_nd``; ``None`` selects
                ``fcbh.ops`` (the same default the constructor uses).
            dtype: Optional dtype forwarded to the convolution.
            device: Optional device forwarded to the convolution.
        """
        if operations is None:
            operations = fcbh.ops
        conv = operations.conv_nd(
            self.dims, channels, channels, 1, padding=0, dtype=dtype, device=device
        )
        return TimestepEmbedSequential(zero_module(conv))

    def forward(self, x, hint, timesteps, context, y=None, **kwargs):
        """Compute the control outputs for one step.

        Args:
            x: Input tensor; cast to ``self.dtype`` before the first block.
            hint: Conditioning input passed through ``input_hint_block``.
            timesteps: Timesteps, embedded with ``timestep_embedding``.
            context: Context forwarded to every block.
            y: Labels for ``label_emb``; required when ``num_classes`` is set
                and must share the leading dimension of ``x``.
            **kwargs: Accepted and ignored.

        Returns:
            A list with one zero-conv output per input block, followed by
            the zero-conv output of the middle block.

        Raises:
            ValueError: If the model is class-conditional and ``y`` is None.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(self.dtype)
        emb = self.time_embed(t_emb)

        guided_hint = self.input_hint_block(hint, emb, context)

        if self.num_classes is not None:
            if y is None:
                raise ValueError("y must be provided when num_classes is set")
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

        outs = []
        h = x.type(self.dtype)
        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
            h = module(h, emb, context)
            if guided_hint is not None:
                # The encoded hint is injected exactly once, right after the
                # first input block.
                h += guided_hint
                guided_hint = None
            outs.append(zero_conv(h, emb, context))

        h = self.middle_block(h, emb, context)
        outs.append(self.middle_block_out(h, emb, context))

        return outs
 | |
| 
 |