Description
I follow Enable PyTorch with DirectML on Windows and can use AMD GPU to run simple calculations:
import torch
import torch_directml
dml = torch_directml.device()
#Test a simple operation
x = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=dml)
y = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=dml)
print(x + y)
tensor([2., 4., 6.], device='privateuseone:0')
But it gets stuck when using Breeze-7B LLM. Did I go wrong somewhere?
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch_directml
dml = torch_directml.device()
model = AutoModelForCausalLM.from_pretrained(
"MediaTek-Research/Breeze-7B-Instruct-v1_0",
torch_dtype=torch.float32,
)
model.to(dml)
tokenizer = AutoTokenizer.from_pretrained('MediaTek-Research/Breeze-7B-Instruct-v1_0')
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
def get_completion_breeze(prompt):
chat = [{"role": "user", "content": prompt}]
input_text = tokenizer.apply_chat_template(chat, tokenize=False)
input_tensors = tokenizer(input_text, return_tensors="pt").to(dml) # Move tensors to DirectML device
input_tensors['attention_mask'] = torch.ones(input_tensors['input_ids'].shape, dtype=torch.long, device=dml) # Ensure attention_mask is also on the DirectML device
outputs = model.generate(
input_tensors["input_ids"],
attention_mask=input_tensors['attention_mask'],
max_new_tokens=200,
top_p=0.01,
top_k=85,
repetition_penalty=1.1,
temperature=0.01
)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
return result
print(get_completion_breeze("hi"))
PS C:\Users\hung> & C:/Users/hung/miniconda3/envs/pydml/python.exe c:/Users/hung/Desktop/Breeze-7B.py
Loading checkpoint shards: 100%|███████████████████████| 4/4 [00:22<00:00, 5.72s/it]
Traceback (most recent call last):
File "c:\Users\hung\Desktop\Breeze-7B.py", line 11, in
model.to(dml)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\transformers\modeling_utils.py", line 2576, in to
return super().to(*args, **kwargs)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 1145, in to
return self._apply(convert)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 797, in _apply
module._apply(fn)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 797, in _apply
module._apply(fn)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 797, in _apply
module._apply(fn)
[Previous line repeated 2 more times]
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 820, in _apply
param_applied = fn(param)
File "C:\Users\hung\miniconda3\envs\pydml\lib\site-packages\torch\nn\modules\module.py", line 1143, in convert
return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
RuntimeError: Could not allocate tensor with 234881024 bytes. There is not enough GPU video memory available!