Merge pull request AUTOMATIC1111#2047 from vladmandic/master
update dev
vladmandic authored Aug 20, 2023
2 parents 5b7f873 + 70395fb commit 4826197
Showing 9 changed files with 21 additions and 45 deletions.
6 changes: 4 additions & 2 deletions installer.py
@@ -388,10 +388,10 @@ def check_torch():
os.environ.setdefault('NEOReadDebugKeys', '1')
os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
if "linux" in sys.platform:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu openvino==2023.1.0.dev20230728 -f https://developer.intel.com/ipex-whl-stable-xpu')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu')
os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow==2.13.0 intel-extension-for-tensorflow[gpu]')
else:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.0a0 intel_extension_for_pytorch==2.0.110+gitba7f6c1 openvino==2023.1.0.dev20230728 -f https://developer.intel.com/ipex-whl-stable-xpu')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.0a0 intel_extension_for_pytorch==2.0.110+gitba7f6c1 -f https://developer.intel.com/ipex-whl-stable-xpu')
else:
machine = platform.machine()
if sys.platform == 'darwin':
@@ -456,6 +456,8 @@ def check_torch():
log.debug(f'Cannot install xformers package: {e}')
if opts.get('cuda_compile_backend', '') == 'hidet':
install('hidet', 'hidet')
if opts.get('cuda_compile_backend', '') == 'openvino_fx':
install('openvino==2023.1.0.dev20230811', 'openvino')
if args.profile:
print_profile(pr, 'Torch')

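For context, the change above drops OpenVINO from the default IPEX torch command and instead installs it only when the openvino_fx compile backend is selected. A minimal sketch of that pattern follows; the install helper and opts dict here are simplified stand-ins, not the installer's actual API.

```python
# Simplified sketch, not the repository's installer: the pip command comes from
# the TORCH_COMMAND environment variable with a platform default, and OpenVINO
# is installed only when the openvino_fx compile backend is selected.
import os
import subprocess
import sys

def install(requirement: str) -> None:
    # stand-in helper; the real installer logs and deduplicates installs
    subprocess.run([sys.executable, "-m", "pip", "install", *requirement.split()], check=True)

torch_command = os.environ.get(
    "TORCH_COMMAND",
    "torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu "
    "-f https://developer.intel.com/ipex-whl-stable-xpu",
)
install(torch_command)

opts = {"cuda_compile_backend": "openvino_fx"}  # stand-in for the loaded options
if opts.get("cuda_compile_backend", "") == "openvino_fx":
    install("openvino==2023.1.0.dev20230811")
```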
2 changes: 1 addition & 1 deletion modules/devices.py
@@ -167,7 +167,7 @@ def set_cuda_params():
args = cmd_args.parser.parse_args()
if args.use_ipex or (hasattr(torch, 'xpu') and torch.xpu.is_available()):
backend = 'ipex'
from modules.ipex_specific import ipex_init
from modules.intel.ipex import ipex_init
ipex_init()
elif args.use_directml:
backend = 'directml'
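The hunk above only changes the import path (modules.ipex_specific becomes modules.intel.ipex), but the surrounding backend selection is worth spelling out. A rough, self-contained sketch of that logic, assuming torch.xpu only exists when intel_extension_for_pytorch is installed:

```python
# Rough sketch of the backend detection shown above: prefer IPEX when an Intel
# XPU is present (or explicitly requested), otherwise fall back to CUDA or CPU.
import torch

def detect_backend(use_ipex: bool = False, use_directml: bool = False) -> str:
    if use_ipex or (hasattr(torch, "xpu") and torch.xpu.is_available()):
        return "ipex"  # the webui then calls ipex_init() from modules.intel.ipex
    if use_directml:
        return "directml"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

print(detect_backend())
```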
Original file line number Diff line number Diff line change
@@ -163,7 +163,3 @@ def ipex_init():

ipex_hijacks()
ipex_diffusers()
try:
from .openvino import openvino_fx
except Exception:
pass
Original file line number Diff line number Diff line change
@@ -61,15 +61,11 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
if block_size >= 4000:
do_split_2 = True
#Find something divisible with the query_tokens
sanity_check = 0
while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split_2 = False

@@ -185,15 +181,11 @@ def __call__(
if block_size >= 4000:
do_split = True
#Find something divisible with the shape_one
sanity_check = 0
while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split = False

@@ -202,15 +194,11 @@ def __call__(
if split_block_size >= 4000:
do_split_2 = True
#Find something divisible with the batch_size_attention
sanity_check = 0
while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split_2 = False

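The deletions above remove a sanity_check counter from each slicing loop. The counter is redundant because the loop already halves the slice size on every iteration and breaks once it reaches 1, so termination is guaranteed without an extra bound. A simplified sketch of the heuristic, with the cost formula reduced to its shape (the real code multiplies several tensor dimensions and a dtype-dependent block_multiply):

```python
# Simplified sketch of the slicing heuristic above: keep halving the slice size
# until the estimated memory block drops under the ~4000 budget used by the
# code, clamping at 1. The cost formula here is only an approximation.
def pick_slice_size(slice_size: int, tokens: int, block_multiply: float, budget: float = 4000.0) -> int:
    while (slice_size * tokens / 1024) * block_multiply > budget:
        slice_size //= 2
        if slice_size <= 1:
            return 1
    return slice_size

print(pick_slice_size(slice_size=64, tokens=65536, block_multiply=2.4))
```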
25 changes: 5 additions & 20 deletions modules/ipex_specific/hijacks.py → modules/intel/ipex/hijacks.py
@@ -11,22 +11,14 @@ def ipex_no_cuda(orig_func, *args, **kwargs): # pylint: disable=redefined-outer-
#Autocast
original_autocast = torch.autocast
def ipex_autocast(*args, **kwargs):
if args[0] == "cuda":
if args[0] == "cuda" or args[0] == "xpu":
if "dtype" in kwargs:
return original_autocast("xpu", *args[1:], **kwargs)
else:
return original_autocast("xpu", *args[1:], dtype=devices.dtype, **kwargs)
else:
return original_autocast(*args, **kwargs)

#Diffusers BF16:
original_linear_forward = torch.nn.modules.Linear.forward
def linear_forward(self, input):
if input.dtype != self.weight.data.dtype:
return original_linear_forward(self, input.to(self.weight.data.dtype))
else:
return original_linear_forward(self, input)

#Embedding BF16
original_torch_cat = torch.cat
def torch_cat(input, *args, **kwargs):
@@ -35,14 +27,6 @@ def torch_cat(input, *args, **kwargs):
else:
return original_torch_cat(input, *args, **kwargs)

original_conv2d = torch.nn.functional.conv2d
#Diffusers BF16:
def conv2d(input, weight, *args, **kwargs):
if input.dtype != weight.data.dtype:
return original_conv2d(input.to(weight.data.dtype), weight, *args, **kwargs)
else:
return original_conv2d(input, weight, *args, **kwargs)

original_interpolate = torch.nn.functional.interpolate
#Latent antialias:
def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False):
@@ -77,6 +61,9 @@ def ipex_hijacks():
CondFunc('torch.zeros',
lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=devices.device, **kwargs),
lambda orig_func, *args, device=None, **kwargs: (type(device) is torch.device and device.type == "cuda") or (type(device) is str and "cuda" in device))
CondFunc('torch.tensor',
lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=devices.device, **kwargs),
lambda orig_func, *args, device=None, **kwargs: (type(device) is torch.device and device.type == "cuda") or (type(device) is str and "cuda" in device))

#Broken functions when torch.cuda.is_available is True:
#Pin Memory:
@@ -98,7 +85,7 @@ def ipex_hijacks():
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs),
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
input.dtype != weight.data.dtype and weight is not None)
weight is not None and input.dtype != weight.data.dtype)

#Functions that does not work with the XPU:
#UniPC:
@@ -129,7 +116,5 @@ def ipex_hijacks():

#Functions that make compile mad with CondFunc:
torch.autocast = ipex_autocast
torch.nn.modules.Linear.forward = linear_forward
torch.cat = torch_cat
torch.nn.functional.conv2d = conv2d
torch.nn.functional.interpolate = interpolate
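Most of the hijacks above go through the webui's CondFunc helper, which swaps in a replacement implementation only when a predicate on the call's arguments matches (here: a CUDA device was requested on an XPU-only system). Below is a simplified stand-in for that pattern, not the webui's actual CondFunc API, illustrating the new torch.tensor hijack and redirecting to the CPU so the snippet runs anywhere.

```python
# Simplified stand-in for the CondFunc pattern: wrap a callable so a substitute
# runs whenever a condition on the arguments holds, otherwise call the original.
import torch

def cond_func(orig, sub, cond):
    def wrapper(*args, **kwargs):
        if cond(orig, *args, **kwargs):
            return sub(orig, *args, **kwargs)
        return orig(*args, **kwargs)
    return wrapper

def wants_cuda(orig, *args, device=None, **kwargs):
    return (isinstance(device, torch.device) and device.type == "cuda") or \
           (isinstance(device, str) and "cuda" in device)

def redirect_device(orig, *args, device=None, **kwargs):
    # the webui redirects to devices.device (the XPU); "cpu" keeps this sketch runnable
    return orig(*args, device="cpu", **kwargs)

torch.tensor = cond_func(torch.tensor, redirect_device, wants_cuda)
x = torch.tensor([1.0, 2.0], device="cuda")  # silently lands on the redirected device
print(x.device)
```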
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import torch
import intel_extension_for_pytorch as ipex
from openvino.frontend.pytorch.torchdynamo.execute import execute
from openvino.frontend.pytorch.torchdynamo.partition import Partitioner
from torch._dynamo.backends.common import fake_tensor_unsupported
2 changes: 2 additions & 0 deletions modules/sd_hijack.py
@@ -188,6 +188,8 @@ def hijack(self, m):
import logging
shared.log.info(f"Compiling pipeline={m.model.__class__.__name__} mode={opts.cuda_compile_backend}")
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
if shared.opts.cuda_compile_backend == "openvino_fx":
from modules.intel.openvino import openvino_fx
log_level = logging.WARNING if opts.cuda_compile_verbose else logging.CRITICAL # pylint: disable=protected-access
if hasattr(torch, '_logging'):
torch._logging.set_logs(dynamo=log_level, aot=log_level, inductor=log_level) # pylint: disable=protected-access
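The two added lines defer the OpenVINO import until the openvino_fx backend is actually selected; importing the module is what makes the backend name available to TorchDynamo (the module appears to register itself on import, judging by its torch._dynamo imports). A rough sketch of that lazy-import-then-compile flow, using the built-in "eager" backend as a stand-in so the snippet runs without OpenVINO installed:

```python
# Rough sketch of the lazy-import pattern above: only pull in the OpenVINO
# dynamo backend when it is the configured compile backend, then compile.
import torch

backend = "eager"  # stand-in for shared.opts.cuda_compile_backend
if backend == "openvino_fx":
    from modules.intel.openvino import openvino_fx  # noqa: F401  (registers the backend)

model = torch.nn.Linear(4, 4)
compiled = torch.compile(model, backend=backend)
print(compiled(torch.randn(1, 4)).shape)
```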
12 changes: 8 additions & 4 deletions modules/sd_models.py
@@ -600,6 +600,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
import logging
logging.getLogger("diffusers").setLevel(logging.ERROR)
timer.record("diffusers")
devices.set_cuda_params()
diffusers_load_config = {
"low_cpu_mem_usage": True,
"torch_dtype": devices.dtype,
@@ -636,7 +637,6 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
if model_name is not None:
shared.log.info(f'Loading diffuser {op}: {model_name}')
model_file = modelloader.download_diffusers_model(hub_id=model_name)
devices.set_cuda_params()
try:
shared.log.debug(f'Diffusers load {op} config: {diffusers_load_config}')
sd_model = diffusers.DiffusionPipeline.from_pretrained(model_file, **diffusers_load_config)
@@ -651,7 +651,6 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
unload_model_weights(op=op)
return

devices.set_cuda_params()
vae = None
sd_vae.loaded_vae_file = None
if op == 'model' or op == 'refiner':
@@ -759,7 +758,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
sd_model.unet.to(memory_format=torch.channels_last)

base_sent_to_cpu=False
if (shared.opts.cuda_compile or shared.opts.ipex_optimize) and torch.cuda.is_available():
if (shared.opts.cuda_compile and shared.opts.cuda_compile_backend != 'none') or shared.opts.ipex_optimize:
if op == 'refiner' and not sd_model.has_accelerate:
gpu_vram = memory_stats().get('gpu', {})
free_vram = gpu_vram.get('total', 0) - gpu_vram.get('used', 0)
@@ -785,20 +784,25 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
try:
if shared.opts.ipex_optimize:
sd_model.unet.training = False
sd_model.vae.training = False
sd_model.unet = torch.xpu.optimize(sd_model.unet, dtype=devices.dtype_unet, inplace=True, weights_prepack=False) # pylint: disable=attribute-defined-outside-init
sd_model.vae = torch.xpu.optimize(sd_model.vae, dtype=devices.dtype_unet, inplace=True, weights_prepack=False) # pylint: disable=attribute-defined-outside-init
shared.log.info("Applied IPEX Optimize.")
except Exception as err:
shared.log.warning(f"IPEX Optimize not supported: {err}")
try:
if shared.opts.cuda_compile:
if shared.opts.cuda_compile and shared.opts.cuda_compile_backend != 'none':
shared.log.info(f"Compiling pipeline={sd_model.__class__.__name__} shape={8 * sd_model.unet.config.sample_size} mode={shared.opts.cuda_compile_backend}")
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
if shared.opts.cuda_compile_backend == "openvino_fx":
from modules.intel.openvino import openvino_fx
log_level = logging.WARNING if shared.opts.cuda_compile_verbose else logging.CRITICAL # pylint: disable=protected-access
if hasattr(torch, '_logging'):
torch._logging.set_logs(dynamo=log_level, aot=log_level, inductor=log_level) # pylint: disable=protected-access
torch._dynamo.config.verbose = shared.opts.cuda_compile_verbose # pylint: disable=protected-access
torch._dynamo.config.suppress_errors = shared.opts.cuda_compile_errors # pylint: disable=protected-access
sd_model.unet = torch.compile(sd_model.unet, mode=shared.opts.cuda_compile_mode, backend=shared.opts.cuda_compile_backend, fullgraph=shared.opts.cuda_compile_fullgraph) # pylint: disable=attribute-defined-outside-init
sd_model.vae.decode = torch.compile(sd_model.vae.decode, mode=shared.opts.cuda_compile_mode, backend=shared.opts.cuda_compile_backend, fullgraph=shared.opts.cuda_compile_fullgraph) # pylint: disable=attribute-defined-outside-init
sd_model("dummy prompt")
shared.log.info("Complilation done.")
except Exception as err:
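Besides moving devices.set_cuda_params() earlier and gating on cuda_compile_backend != 'none', the hunk above now also compiles vae.decode and runs a throwaway prompt so the expensive first-call compilation happens at load time rather than on the user's first generation. A minimal sketch of that step, with a fake pipeline object standing in for the loaded diffusers pipeline:

```python
# Minimal sketch of the compile-and-warm-up step above; _FakePipe is a stand-in
# so the snippet runs without loading a real diffusers pipeline.
import torch

class _FakePipe:
    def __init__(self):
        self.unet = torch.nn.Linear(4, 4)
        self.vae = torch.nn.Module()
        self.vae.decode = lambda z: z
    def __call__(self, prompt):
        return self.unet(torch.randn(1, 4))

def compile_pipeline(pipe, mode="default", backend="eager", fullgraph=False):
    pipe.unet = torch.compile(pipe.unet, mode=mode, backend=backend, fullgraph=fullgraph)
    pipe.vae.decode = torch.compile(pipe.vae.decode, mode=mode, backend=backend, fullgraph=fullgraph)
    pipe("dummy prompt")  # warm-up call so the first real generation is not the slow one
    return pipe

compile_pipeline(_FakePipe())
```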
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -373,7 +373,7 @@ def default(obj):
options_templates.update(options_section(('cuda', "Compute Settings"), {
"memmon_poll_rate": OptionInfo(2, "VRAM usage polls per second during generation", gr.Slider, {"minimum": 0, "maximum": 40, "step": 1}),
"precision": OptionInfo("Autocast", "Precision type", gr.Radio, lambda: {"choices": ["Autocast", "Full"]}),
"cuda_dtype": OptionInfo("FP32" if sys.platform == "darwin" else "FP16", "Device precision type", gr.Radio, lambda: {"choices": ["FP32", "FP16", "BF16"]}),
"cuda_dtype": OptionInfo("FP32" if sys.platform == "darwin" else "BF16" if devices.backend == "ipex" else "FP16", "Device precision type", gr.Radio, lambda: {"choices": ["FP32", "FP16", "BF16"]}),
"no_half": OptionInfo(False, "Use full precision for model (--no-half)", None, None, None),
"no_half_vae": OptionInfo(False, "Use full precision for VAE (--no-half-vae)"),
"upcast_sampling": OptionInfo(True if sys.platform == "darwin" else False, "Enable upcast sampling"),
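The one-line change above makes BF16 the default device precision when the IPEX backend is active, keeping FP32 on macOS and FP16 everywhere else. The resolution order, as a tiny sketch mirroring that conditional expression:

```python
# Tiny sketch of the default-precision choice above: FP32 on macOS, BF16 when
# the detected backend is IPEX (Intel XPU), FP16 otherwise.
import sys

def default_cuda_dtype(backend: str) -> str:
    if sys.platform == "darwin":
        return "FP32"
    if backend == "ipex":
        return "BF16"
    return "FP16"

print(default_cuda_dtype("ipex"))
```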
