Revert "Merge pull request #2411 from vladmandic/master"

This reverts commit 64cce8a, reversing changes made to 597fc18.
vladmandic · Oct 26, 2023 · 267905e · 267905e
1 parent 886af2e
commit 267905e
Show file tree

Hide file tree

Showing 282 changed files with 43,168 additions and 288 deletions.
diff --git a/.gitignore b/.gitignore
@@ -43,7 +43,6 @@ cache
 !package.json
 
 # all dynamic stuff
-/repositories/**/*
 /extensions/**/*
 /outputs/**/*
 /embeddings/**/*
@@ -59,6 +58,5 @@ cache
 /localizations
 
 # unexcluded so folders get created
-!/repositories/.placeholder
 !/models/VAE-approx
 !/models/VAE-approx/model.pt
diff --git a/.gitmodules b/.gitmodules
@@ -32,3 +32,7 @@
   path = extensions-builtin/sd-extension-chainner
   url = https://github.com/vladmandic/sd-extension-chainner
   ignore = dirty
+[submodule "modules/k-diffusion"]
+  path = modules/k-diffusion
+  url = https://github.com/crowsonkb/k-diffusion
+  ignore = dirty
diff --git a/.pylintrc b/.pylintrc
@@ -151,6 +151,7 @@ disable=bad-inline-option,
         missing-function-docstring,
         missing-module-docstring,
         no-else-return,
+        not-callable,
         pointless-string-statement,
         raw-checker-failed,
         simplifiable-if-expression,

diff --git a/configs/v2-1-stable-unclip-h-inference.yaml b/configs/v2-1-stable-unclip-h-inference.yaml
@@ -0,0 +1,80 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+  params:
+    embedding_dropout: 0.25
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 96
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn-adm
+    scale_factor: 0.18215
+    monitor: val/loss_simple_ema
+    use_ema: False
+
+    embedder_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+
+    noise_aug_config:
+      target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+      params:
+        timestep_dim: 1024
+        noise_schedule_config:
+          timesteps: 1000
+          beta_schedule: squaredcos_cap_v2
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: "sequential"
+        adm_in_channels: 2048
+        use_checkpoint: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+            - 1
+            - 2
+            - 4
+            - 4
+          num_res_blocks: 2
+          attn_resolutions: [ ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
diff --git a/configs/v2-1-stable-unclip-l-inference.yaml b/configs/v2-1-stable-unclip-l-inference.yaml
@@ -0,0 +1,83 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
+  params:
+    embedding_dropout: 0.25
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 96
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn-adm
+    scale_factor: 0.18215
+    monitor: val/loss_simple_ema
+    use_ema: False
+
+    embedder_config:
+      target: ldm.modules.encoders.modules.ClipImageEmbedder
+      params:
+        model: "ViT-L/14"
+
+    noise_aug_config:
+      target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+      params:
+        clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
+        timestep_dim: 768
+        noise_schedule_config:
+          timesteps: 1000
+          beta_schedule: squaredcos_cap_v2
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        num_classes: "sequential"
+        adm_in_channels: 1536
+        use_checkpoint: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+            - 1
+            - 2
+            - 4
+            - 4
+          num_res_blocks: 2
+          attn_resolutions: [ ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
diff --git a/configs/v2-midas-inference.yaml b/configs/v2-midas-inference.yaml
@@ -0,0 +1,74 @@
+model:
+  base_learning_rate: 5.0e-07
+  target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: hybrid
+    scale_factor: 0.18215
+    monitor: val/loss_simple_ema
+    finetune_keys: null
+    use_ema: False
+
+    depth_stage_config:
+      target: ldm.modules.midas.api.MiDaSInference
+      params:
+        model_type: "dpt_hybrid"
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        image_size: 32 # unused
+        in_channels: 5
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+            - 1
+            - 2
+            - 4
+            - 4
+          num_res_blocks: 2
+          attn_resolutions: [ ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+
+
diff --git a/extensions-builtin/Lora/lora_convert.py b/extensions-builtin/Lora/lora_convert.py
@@ -112,11 +112,12 @@ def __init__(self):
             self.converter = self.diffusers
             self.is_sdxl = True if shared.sd_model_type == "sdxl" else False
             self.UNET_CONVERSION_MAP = make_unet_conversion_map() if self.is_sdxl else None
-            self.LORA_PREFIX_UNET = "lora_unet"
-            self.LORA_PREFIX_TEXT_ENCODER = "lora_te"
+            self.LORA_PREFIX_UNET = "lora_unet_"
+            self.LORA_PREFIX_TEXT_ENCODER = "lora_te_"
+            self.OFT_PREFIX_UNET = "oft_unet_"
             # SDXL: must start with LORA_PREFIX_TEXT_ENCODER
-            self.LORA_PREFIX_TEXT_ENCODER1 = "lora_te1"
-            self.LORA_PREFIX_TEXT_ENCODER2 = "lora_te2"
+            self.LORA_PREFIX_TEXT_ENCODER1 = "lora_te1_"
+            self.LORA_PREFIX_TEXT_ENCODER2 = "lora_te2_"
 
     def original(self, key):
         key = convert_diffusers_name_to_compvis(key, self.is_sd2)
@@ -142,13 +143,12 @@ def diffusers(self, key):
         if self.is_sdxl:
             map_keys = list(self.UNET_CONVERSION_MAP.keys())  # prefix of U-Net modules
             map_keys.sort()
-            search_key = key.replace(self.LORA_PREFIX_UNET + "_", "").replace(self.LORA_PREFIX_TEXT_ENCODER1 + "_",
-                                                                              "").replace(
-                self.LORA_PREFIX_TEXT_ENCODER2 + "_", "")
+            search_key = key.replace(self.LORA_PREFIX_UNET, "").replace(self.OFT_PREFIX_UNET, "").replace(self.LORA_PREFIX_TEXT_ENCODER1, "").replace(self.LORA_PREFIX_TEXT_ENCODER2, "")
+
             position = bisect.bisect_right(map_keys, search_key)
             map_key = map_keys[position - 1]
             if search_key.startswith(map_key):
-                key = key.replace(map_key, self.UNET_CONVERSION_MAP[map_key]) # pylint: disable=unsubscriptable-object
+                key = key.replace(map_key, self.UNET_CONVERSION_MAP[map_key]).replace("oft","lora") # pylint: disable=unsubscriptable-object
         sd_module = shared.sd_model.network_layer_mapping.get(key, None)
         return key, sd_module
 

diff --git a/extensions-builtin/Lora/network_oft.py b/extensions-builtin/Lora/network_oft.py
@@ -0,0 +1,49 @@
+import torch
+import diffusers.models.lora as diffusers_lora
+import network
+from modules import devices
+
+class ModuleTypeOFT(network.ModuleType):
+    def create_module(self, net: network.Network, weights: network.NetworkWeights):
+        """
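+        Create a NetworkModuleOFT when the weights contain an "oft_blocks" entry; otherwise return None.
+
+        Example (truncated) dump of weights.w.items() for weights that contain "oft_blocks":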
+
+        alpha  :  tensor(0.0010, dtype=torch.bfloat16)
+        oft_blocks  :  tensor([[[ 0.0000e+00,  1.4400e-04,  1.7319e-03,  ..., -8.8882e-04,
+           5.7373e-03, -4.4250e-03],
+         [-1.4400e-04,  0.0000e+00,  8.6594e-04,  ...,  1.5945e-03,
+          -8.5449e-04,  1.9684e-03], ...etc...
+         , dtype=torch.bfloat16)"""
+
+        if "oft_blocks" in weights.w.keys():
+            module = NetworkModuleOFT(net, weights)
+            return module
+        else:
+            return None
+
+
+class NetworkModuleOFT(network.NetworkModule):
+    def __init__(self, net: network.Network, weights: network.NetworkWeights):
+        super().__init__(net, weights)
+
+        self.weights = weights.w.get("oft_blocks").to(device=devices.device)
+        self.dim = self.weights.shape[0]  # num blocks
+        self.alpha = self.multiplier()
+        self.block_size = self.weights.shape[-1]
+
+    def get_weight(self):
+        block_Q = self.weights - self.weights.transpose(1, 2)
+        I = torch.eye(self.block_size, device=devices.device).unsqueeze(0).repeat(self.dim, 1, 1)
+        block_R = torch.matmul(I + block_Q, (I - block_Q).inverse())
+        block_R_weighted = self.alpha * block_R + (1 - self.alpha) * I
+        R = torch.block_diag(*block_R_weighted)
+        return R
+
+    def calc_updown(self, orig_weight):
+        R = self.get_weight().to(device=devices.device, dtype=orig_weight.dtype)
+        if orig_weight.dim() == 4:
+            updown = torch.einsum("oihw, op -> pihw", orig_weight, R) * self.calc_scale()
+        else:
+            updown = torch.einsum("oi, op -> pi", orig_weight, R) * self.calc_scale()
+
+        return self.finalize_updown(updown, orig_weight, orig_weight.shape)
diff --git a/extensions-builtin/Lora/networks.py b/extensions-builtin/Lora/networks.py
@@ -7,6 +7,7 @@
 import network_lora
 import network_hada
 import network_ia3
+import network_oft
 import network_lokr
 import network_full
 import network_norm
@@ -32,6 +33,7 @@
     network_lora.ModuleTypeLora(),
     network_hada.ModuleTypeHada(),
     network_ia3.ModuleTypeIa3(),
+    network_oft.ModuleTypeOFT(),
     network_lokr.ModuleTypeLokr(),
     network_full.ModuleTypeFull(),
     network_norm.ModuleTypeNorm(),

diff --git a/installer.py b/installer.py
@@ -591,6 +591,7 @@ def install_packages():
 
 # clone required repositories
 def install_repositories():
+    """
     if args.profile:
         pr = cProfile.Profile()
         pr.enable()
@@ -615,6 +616,7 @@ def d(name):
     clone(blip_repo, d('BLIP'), blip_commit)
     if args.profile:
         print_profile(pr, 'Repositories')
+    """
 
 
 # run extension installer