Conversation

Cool! I will check it tomorrow~

@gluttony-10 What's your version of bitsandbytes? I encountered an error with INT8 when running it in the default environment:

511 if mode == 'und':
--> 512 packed_query_states = self.q_proj(packed_query_sequence).view(-1, self.num_heads, self.head_dim)
513 packed_key_states = self.k_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
514 packed_value_states = self.v_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
File /usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File /usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File /usr/local/lib/python3.11/dist-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File /usr/local/lib/python3.11/dist-packages/bitsandbytes/nn/modules.py:797, in Linear8bitLt.forward(self, x)
794 if self.bias is not None and self.bias.dtype != x.dtype:
795 self.bias.data = self.bias.data.to(x.dtype)
--> 797 out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
799 if not self.state.has_fp16_weights:
800 if self.state.CB is not None and self.state.CxB is not None:
801 # we converted 8-bit row major to turing/ampere format in the first inference pass
802 # we no longer need the row-major weight
File /usr/local/lib/python3.11/dist-packages/bitsandbytes/autograd/_functions.py:556, in matmul(A, B, out, state, threshold, bias)
554 if threshold > 0.0:
555 state.threshold = threshold
--> 556 return MatMul8bitLt.apply(A, B, out, bias, state)
File /usr/local/lib/python3.11/dist-packages/torch/autograd/function.py:575, in Function.apply(cls, *args, **kwargs)
572 if not torch._C._are_functorch_transforms_active():
573 # See NOTE: [functorch vjp and autograd interaction]
574 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 575 return super().apply(*args, **kwargs) # type: ignore[misc]
577 if not is_setup_ctx_defined:
578 raise RuntimeError(
579 "In order to use an autograd.Function with functorch transforms "
580 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
581 "staticmethod. For more details, please see "
582 "https://pytorch.org/docs/main/notes/extending.func.html"
583 )
File /usr/local/lib/python3.11/dist-packages/bitsandbytes/autograd/_functions.py:395, in MatMul8bitLt.forward(ctx, A, B, out, bias, state)
393 if using_igemmlt:
394 C32A, SA = F.transform(CA, "col32")
--> 395 out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
396 if bias is None or bias.dtype == torch.float16:
397 # we apply the fused bias here
398 output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
File /usr/local/lib/python3.11/dist-packages/bitsandbytes/functional.py:2337, in igemmlt(A, B, SA, SB, out, Sout, dtype)
2335 if has_error:
2336 print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}")
-> 2337 raise Exception("cublasLt ran into an error!")
2339 torch.cuda.set_device(prev_device)
2341 return out, Sout
Exception: cublasLt ran into an error!
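For reference: the exception above is raised inside bitsandbytes' int8 matmul kernel (`igemmlt`), and that code path depends on both the installed bitsandbytes version and the GPU architecture. A minimal environment check like the sketch below is a reasonable first step; the exact package list is an assumption, not something prescribed by this PR.

```python
# Minimal environment check for the LLM.int8 path; illustrative only.
from importlib.metadata import PackageNotFoundError, version

import torch

for pkg in ("bitsandbytes", "accelerate", "transformers", "torch"):
    try:
        print(f"{pkg:>13}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg:>13}: not installed")

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    # A100 reports compute capability (8, 0); H-series (Hopper) GPUs report (9, 0).
    print(f"GPU: {torch.cuda.get_device_name()} (sm_{major}{minor})")
```

If the int8 path keeps failing on Hopper, the usual things to try are upgrading bitsandbytes or falling back to 4-bit loading, which uses a different kernel path than `igemmlt`.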
Package    Version
accelerate 1.7.0

Okay~ It works on the A100, but not on H-series GPUs. I will merge it and update some descriptions.
Thank you for merging.

1. Add quantization code (quantization is done in code, so the official model works as-is with no extra downloads; see the sketch after this list).
2. Add a Chinese translation (optional, enabled with the --zh launch argument).
3. Fix text.
4. Add a progress bar, following #49.
5. Add the missing precision conversion.
6. Add the required dependencies.
7. Update the installation instructions in the README (smoother installation).
8. Add launch instructions to the README (two quantized launch methods, with reference VRAM usage).
9. I originally wanted to add DF11 as well, but its dependencies have a fairly involved installation order and it requires downloading an extra model, so I dropped DF11.
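For readers who want a feel for what the two quantized launch methods correspond to, here is a minimal sketch of load-time quantization with transformers and bitsandbytes, which matches the "no extra download" point above (the official checkpoint is quantized while it is being loaded). The model id, the AutoModelForCausalLM entry point, and the 8-bit vs. 4-bit NF4 settings are illustrative assumptions, not this PR's exact code.

```python
# Illustrative sketch of load-time quantization; not this PR's actual code.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "official/model-id"  # placeholder for the official checkpoint

# Launch method 1: 8-bit (LLM.int8) -- roughly halves weight VRAM vs. bf16.
int8_config = BitsAndBytesConfig(load_in_8bit=True)

# Launch method 2: 4-bit NF4 -- lower VRAM still, and a different kernel path.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=nf4_config,  # or int8_config
    device_map="auto",
)

# A rough way to record the "reference VRAM" numbers mentioned in the README:
if torch.cuda.is_available():
    print(f"max allocated: {torch.cuda.max_memory_allocated() / 2**30:.1f} GiB")
```

Quantizing at load time is what makes the "official model, no extra download" point work: only the original full-precision weights are fetched, and the int8/NF4 tensors are produced on the fly.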