Merge pull request AUTOMATIC1111#2047 from vladmandic/master
update dev
vladmandic authored Aug 20, 2023
2 parents 5b7f873 + 70395fb commit 4826197
Showing 9 changed files with 21 additions and 45 deletions.
6 changes: 4 additions & 2 deletions installer.py
@@ -388,10 +388,10 @@ def check_torch():
os.environ.setdefault('NEOReadDebugKeys', '1')
os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
if "linux" in sys.platform:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu openvino==2023.1.0.dev20230728 -f https://developer.intel.com/ipex-whl-stable-xpu')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu')
os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow==2.13.0 intel-extension-for-tensorflow[gpu]')
else:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.0a0 intel_extension_for_pytorch==2.0.110+gitba7f6c1 openvino==2023.1.0.dev20230728 -f https://developer.intel.com/ipex-whl-stable-xpu')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.0a0 intel_extension_for_pytorch==2.0.110+gitba7f6c1 -f https://developer.intel.com/ipex-whl-stable-xpu')
else:
machine = platform.machine()
if sys.platform == 'darwin':
@@ -456,6 +456,8 @@ def check_torch():
log.debug(f'Cannot install xformers package: {e}')
if opts.get('cuda_compile_backend', '') == 'hidet':
install('hidet', 'hidet')
if opts.get('cuda_compile_backend', '') == 'openvino_fx':
install('openvino==2023.1.0.dev20230811', 'openvino')
if args.profile:
print_profile(pr, 'Torch')

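For context, the change above drops OpenVINO from the default IPEX torch command and instead installs it only when the openvino_fx compile backend is selected. A minimal sketch of that pattern follows; the install helper and opts dict here are simplified stand-ins, not the installer's actual API.

```python
# Simplified sketch, not the repository's installer: the pip command comes from
# the TORCH_COMMAND environment variable with a platform default, and OpenVINO
# is installed only when the openvino_fx compile backend is selected.
import os
import subprocess
import sys

def install(requirement: str) -> None:
    # stand-in helper; the real installer logs and deduplicates installs
    subprocess.run([sys.executable, "-m", "pip", "install", *requirement.split()], check=True)

torch_command = os.environ.get(
    "TORCH_COMMAND",
    "torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu "
    "-f https://developer.intel.com/ipex-whl-stable-xpu",
)
install(torch_command)

opts = {"cuda_compile_backend": "openvino_fx"}  # stand-in for the loaded options
if opts.get("cuda_compile_backend", "") == "openvino_fx":
    install("openvino==2023.1.0.dev20230811")
```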
2 changes: 1 addition & 1 deletion modules/devices.py
@@ -167,7 +167,7 @@ def set_cuda_params():
args = cmd_args.parser.parse_args()
if args.use_ipex or (hasattr(torch, 'xpu') and torch.xpu.is_available()):
backend = 'ipex'
from modules.ipex_specific import ipex_init
from modules.intel.ipex import ipex_init
ipex_init()
elif args.use_directml:
backend = 'directml'
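The hunk above only changes the import path (modules.ipex_specific becomes modules.intel.ipex), but the surrounding backend selection is worth spelling out. A rough, self-contained sketch of that logic, assuming torch.xpu only exists when intel_extension_for_pytorch is installed:

```python
# Rough sketch of the backend detection shown above: prefer IPEX when an Intel
# XPU is present (or explicitly requested), otherwise fall back to CUDA or CPU.
import torch

def detect_backend(use_ipex: bool = False, use_directml: bool = False) -> str:
    if use_ipex or (hasattr(torch, "xpu") and torch.xpu.is_available()):
        return "ipex"  # the webui then calls ipex_init() from modules.intel.ipex
    if use_directml:
        return "directml"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

print(detect_backend())
```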
Original file line number Diff line number Diff line change
@@ -163,7 +163,3 @@ def ipex_init():

ipex_hijacks()
ipex_diffusers()
try:
from .openvino import openvino_fx
except Exception:
pass
Original file line number Diff line number Diff line change
@@ -61,15 +61,11 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
if block_size >= 4000:
do_split_2 = True
#Find something divisible with the query_tokens
sanity_check = 0
while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split_2 = False

@@ -185,15 +181,11 @@ def __call__(
if block_size >= 4000:
do_split = True
#Find something divisible with the shape_one
sanity_check = 0
while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split = False

@@ -202,15 +194,11 @@ def __call__(
if split_block_size >= 4000:
do_split_2 = True
#Find something divisible with the batch_size_attention
sanity_check = 0
while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
break
sanity_check = sanity_check + 1
if sanity_check >= 128:
break
else:
do_split_2 = False

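The deletions above remove a sanity_check counter from each slicing loop. The counter is redundant because the loop already halves the slice size on every iteration and breaks once it reaches 1, so termination is guaranteed without an extra bound. A simplified sketch of the heuristic, with the cost formula reduced to its shape (the real code multiplies several tensor dimensions and a dtype-dependent block_multiply):

```python
# Simplified sketch of the slicing heuristic above: keep halving the slice size
# until the estimated memory block drops under the ~4000 budget used by the
# code, clamping at 1. The cost formula here is only an approximation.
def pick_slice_size(slice_size: int, tokens: int, block_multiply: float, budget: float = 4000.0) -> int:
    while (slice_size * tokens / 1024) * block_multiply > budget:
        slice_size //= 2
        if slice_size <= 1:
            return 1
    return slice_size

print(pick_slice_size(slice_size=64, tokens=65536, block_multiply=2.4))
```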
25 changes: 5 additions & 20 deletions modules/ipex_specific/hijacks.py → modules/intel/ipex/hijacks.py
@@ -11,22 +11,14 @@ def ipex_no_cuda(orig_func, *args, **kwargs): # pylint: disable=redefined-outer-
#Autocast
original_autocast = torch.autocast
def ipex_autocast(*args, **kwargs):
if args[0] == "cuda":
if args[0] == "cuda" or args[0] == "xpu":
if "dtype" in kwargs:
return original_autocast("xpu", *args[1:], **kwargs)
else:
return original_autocast("xpu", *args[1:], dtype=devices.dtype, **kwargs)
else:
return original_autocast(*args, **kwargs)

#Diffusers BF16:
original_linear_forward = torch.nn.modules.Linear.forward
def linear_forward(self, input):
if input.dtype != self.weight.data.dtype:
return original_linear_forward(self, input.to(self.weight.data.dtype))
else:
return original_linear_forward(self, input)

#Embedding BF16
original_torch_cat = torch.cat
def torch_cat(input, *args, **kwargs):
@@ -35,14 +27,6 @@ def torch_cat(input, *args, **kwargs):
else:
return original_torch_cat(input, *args, **kwargs)

original_conv2d = torch.nn.functional.conv2d
#Diffusers BF16:
def conv2d(input, weight, *args, **kwargs):
if input.dtype != weight.data.dtype:
return original_conv2d(input.to(weight.data.dtype), weight, *args, **kwargs)
else:
return original_conv2d(input, weight, *args, **kwargs)

original_interpolate = torch.nn.functional.interpolate
#Latent antialias:
def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False):
@@ -77,6 +61,9 @@ def ipex_hijacks():
CondFunc('torch.zeros',
lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=devices.device, **kwargs),
lambda orig_func, *args, device=None, **kwargs: (type(device) is torch.device and device.type == "cuda") or (type(device) is str and "cuda" in device))
CondFunc('torch.tensor',
lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=devices.device, **kwargs),
lambda orig_func, *args, device=None, **kwargs: (type(device) is torch.device and device.type == "cuda") or (type(device) is str and "cuda" in device))

#Broken functions when torch.cuda.is_available is True:
#Pin Memory:
@@ -98,7 +85,7 @@ def ipex_hijacks():
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs),
lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs:
input.dtype != weight.data.dtype and weight is not None)
weight is not None and input.dtype != weight.data.dtype)

#Functions that does not work with the XPU:
#UniPC:
@@ -129,7 +116,5 @@ def ipex_hijacks():

#Functions that make compile mad with CondFunc:
torch.autocast = ipex_autocast
torch.nn.modules.Linear.forward = linear_forward
torch.cat = torch_cat
torch.nn.functional.conv2d = conv2d
torch.nn.functional.interpolate = interpolate
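Most of the hijacks above go through the webui's CondFunc helper, which swaps in a replacement implementation only when a predicate on the call's arguments matches (here: a CUDA device was requested on an XPU-only system). Below is a simplified stand-in for that pattern, not the webui's actual CondFunc API, illustrating the new torch.tensor hijack and redirecting to the CPU so the snippet runs anywhere.

```python
# Simplified stand-in for the CondFunc pattern: wrap a callable so a substitute
# runs whenever a condition on the arguments holds, otherwise call the original.
import torch

def cond_func(orig, sub, cond):
    def wrapper(*args, **kwargs):
        if cond(orig, *args, **kwargs):
            return sub(orig, *args, **kwargs)
        return orig(*args, **kwargs)
    return wrapper

def wants_cuda(orig, *args, device=None, **kwargs):
    return (isinstance(device, torch.device) and device.type == "cuda") or \
           (isinstance(device, str) and "cuda" in device)

def redirect_device(orig, *args, device=None, **kwargs):
    # the webui redirects to devices.device (the XPU); "cpu" keeps this sketch runnable
    return orig(*args, device="cpu", **kwargs)

torch.tensor = cond_func(torch.tensor, redirect_device, wants_cuda)
x = torch.tensor([1.0, 2.0], device="cuda")  # silently lands on the redirected device
print(x.device)
```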
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import torch
import intel_extension_for_pytorch as ipex
from openvino.frontend.pytorch.torchdynamo.execute import execute
from openvino.frontend.pytorch.torchdynamo.partition import Partitioner
from torch._dynamo.backends.common import fake_tensor_unsupported
2 changes: 2 additions & 0 deletions modules/sd_hijack.py
@@ -188,6 +188,8 @@ def hijack(self, m):
import logging
shared.log.info(f"Compiling pipeline={m.model.__class__.__name__} mode={opts.cuda_compile_backend}")
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
if shared.opts.cuda_compile_backend == "openvino_fx":
from modules.intel.openvino import openvino_fx
log_level = logging.WARNING if opts.cuda_compile_verbose else logging.CRITICAL # pylint: disable=protected-access
if hasattr(torch, '_logging'):
torch._logging.set_logs(dynamo=log_level, aot=log_level, inductor=log_level) # pylint: disable=protected-access
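The two added lines defer the OpenVINO import until the openvino_fx backend is actually selected; importing the module is what makes the backend name available to TorchDynamo (the module appears to register itself on import, judging by its torch._dynamo imports). A rough sketch of that lazy-import-then-compile flow, using the built-in "eager" backend as a stand-in so the snippet runs without OpenVINO installed:

```python
# Rough sketch of the lazy-import pattern above: only pull in the OpenVINO
# dynamo backend when it is the configured compile backend, then compile.
import torch

backend = "eager"  # stand-in for shared.opts.cuda_compile_backend
if backend == "openvino_fx":
    from modules.intel.openvino import openvino_fx  # noqa: F401  (registers the backend)

model = torch.nn.Linear(4, 4)
compiled = torch.compile(model, backend=backend)
print(compiled(torch.randn(1, 4)).shape)
```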
12 changes: 8 additions & 4 deletions modules/sd_models.py
@@ -600,6 +600,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
import logging
logging.getLogger("diffusers").setLevel(logging.ERROR)
timer.record("diffusers")
devices.set_cuda_params()
diffusers_load_config = {
"low_cpu_mem_usage": True,
"torch_dtype": devices.dtype,
@@ -636,7 +637,6 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
if model_name is not None:
shared.log.info(f'Loading diffuser {op}: {model_name}')
model_file = modelloader.download_diffusers_model(hub_id=model_name)
devices.set_cuda_params()
try:
shared.log.debug(f'Diffusers load {op} config: {diffusers_load_config}')
sd_model = diffusers.DiffusionPipeline.from_pretrained(model_file, **diffusers_load_config)
@@ -651,7 +651,6 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
unload_model_weights(op=op)
return

devices.set_cuda_params()
vae = None
sd_vae.loaded_vae_file = None
if op == 'model' or op == 'refiner':
@@ -759,7 +758,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
sd_model.unet.to(memory_format=torch.channels_last)

base_sent_to_cpu=False
if (shared.opts.cuda_compile or shared.opts.ipex_optimize) and torch.cuda.is_available():
if (shared.opts.cuda_compile and shared.opts.cuda_compile_backend != 'none') or shared.opts.ipex_optimize:
if op == 'refiner' and not sd_model.has_accelerate:
gpu_vram = memory_stats().get('gpu', {})
free_vram = gpu_vram.get('total', 0) - gpu_vram.get('used', 0)
@@ -785,20 +784,25 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
try:
if shared.opts.ipex_optimize:
sd_model.unet.training = False
sd_model.vae.training = False
sd_model.unet = torch.xpu.optimize(sd_model.unet, dtype=devices.dtype_unet, inplace=True, weights_prepack=False) # pylint: disable=attribute-defined-outside-init
sd_model.vae = torch.xpu.optimize(sd_model.vae, dtype=devices.dtype_unet, inplace=True, weights_prepack=False) # pylint: disable=attribute-defined-outside-init
shared.log.info("Applied IPEX Optimize.")
except Exception as err:
shared.log.warning(f"IPEX Optimize not supported: {err}")
try:
if shared.opts.cuda_compile:
if shared.opts.cuda_compile and shared.opts.cuda_compile_backend != 'none':
shared.log.info(f"Compiling pipeline={sd_model.__class__.__name__} shape={8 * sd_model.unet.config.sample_size} mode={shared.opts.cuda_compile_backend}")
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
if shared.opts.cuda_compile_backend == "openvino_fx":
from modules.intel.openvino import openvino_fx
log_level = logging.WARNING if shared.opts.cuda_compile_verbose else logging.CRITICAL # pylint: disable=protected-access
if hasattr(torch, '_logging'):
torch._logging.set_logs(dynamo=log_level, aot=log_level, inductor=log_level) # pylint: disable=protected-access
torch._dynamo.config.verbose = shared.opts.cuda_compile_verbose # pylint: disable=protected-access
torch._dynamo.config.suppress_errors = shared.opts.cuda_compile_errors # pylint: disable=protected-access
sd_model.unet = torch.compile(sd_model.unet, mode=shared.opts.cuda_compile_mode, backend=shared.opts.cuda_compile_backend, fullgraph=shared.opts.cuda_compile_fullgraph) # pylint: disable=attribute-defined-outside-init
sd_model.vae.decode = torch.compile(sd_model.vae.decode, mode=shared.opts.cuda_compile_mode, backend=shared.opts.cuda_compile_backend, fullgraph=shared.opts.cuda_compile_fullgraph) # pylint: disable=attribute-defined-outside-init
sd_model("dummy prompt")
shared.log.info("Complilation done.")
except Exception as err:
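Besides moving devices.set_cuda_params() earlier and gating on cuda_compile_backend != 'none', the hunk above now also compiles vae.decode and runs a throwaway prompt so the expensive first-call compilation happens at load time rather than on the user's first generation. A minimal sketch of that step, with a fake pipeline object standing in for the loaded diffusers pipeline:

```python
# Minimal sketch of the compile-and-warm-up step above; _FakePipe is a stand-in
# so the snippet runs without loading a real diffusers pipeline.
import torch

class _FakePipe:
    def __init__(self):
        self.unet = torch.nn.Linear(4, 4)
        self.vae = torch.nn.Module()
        self.vae.decode = lambda z: z
    def __call__(self, prompt):
        return self.unet(torch.randn(1, 4))

def compile_pipeline(pipe, mode="default", backend="eager", fullgraph=False):
    pipe.unet = torch.compile(pipe.unet, mode=mode, backend=backend, fullgraph=fullgraph)
    pipe.vae.decode = torch.compile(pipe.vae.decode, mode=mode, backend=backend, fullgraph=fullgraph)
    pipe("dummy prompt")  # warm-up call so the first real generation is not the slow one
    return pipe

compile_pipeline(_FakePipe())
```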
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -373,7 +373,7 @@ def default(obj):
options_templates.update(options_section(('cuda', "Compute Settings"), {
"memmon_poll_rate": OptionInfo(2, "VRAM usage polls per second during generation", gr.Slider, {"minimum": 0, "maximum": 40, "step": 1}),
"precision": OptionInfo("Autocast", "Precision type", gr.Radio, lambda: {"choices": ["Autocast", "Full"]}),
"cuda_dtype": OptionInfo("FP32" if sys.platform == "darwin" else "FP16", "Device precision type", gr.Radio, lambda: {"choices": ["FP32", "FP16", "BF16"]}),
"cuda_dtype": OptionInfo("FP32" if sys.platform == "darwin" else "BF16" if devices.backend == "ipex" else "FP16", "Device precision type", gr.Radio, lambda: {"choices": ["FP32", "FP16", "BF16"]}),
"no_half": OptionInfo(False, "Use full precision for model (--no-half)", None, None, None),
"no_half_vae": OptionInfo(False, "Use full precision for VAE (--no-half-vae)"),
"upcast_sampling": OptionInfo(True if sys.platform == "darwin" else False, "Enable upcast sampling"),
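The one-line change above makes BF16 the default device precision when the IPEX backend is active, keeping FP32 on macOS and FP16 everywhere else. The resolution order, as a tiny sketch mirroring that conditional expression:

```python
# Tiny sketch of the default-precision choice above: FP32 on macOS, BF16 when
# the detected backend is IPEX (Intel XPU), FP16 otherwise.
import sys

def default_cuda_dtype(backend: str) -> str:
    if sys.platform == "darwin":
        return "FP32"
    if backend == "ipex":
        return "BF16"
    return "FP16"

print(default_cuda_dtype("ipex"))
```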
