5 files changed, +14 −7 lines
GitHub Actions pre-commit workflow:

@@ -16,4 +16,4 @@
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
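
Both flags are standard pre-commit options: --all-files runs the hooks over the whole repository instead of only the changed files, and --hook-stage manual additionally runs hooks registered for the manual stage. A minimal sketch of the equivalent local invocation, assuming pre-commit is installed and on PATH:

# Minimal sketch: run the same pre-commit invocation locally.
# Assumes the pre-commit executable is installed and on PATH.
import subprocess

result = subprocess.run(
    ["pre-commit", "run", "--all-files", "--hook-stage", "manual"],
    check=False,  # pre-commit exits non-zero when any hook fails
)
print(f"pre-commit exited with code {result.returncode}")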
Pre-commit configuration (codespell hook exclude pattern):

@@ -17,7 +17,7 @@
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
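
The exclude value is one regular expression that pre-commit applies to each candidate file path, so this change makes codespell skip everything under csrc/rocm/ and csrc/gradlib/ in addition to the existing exclusions. A quick standalone sketch for sanity-checking the pattern (the sample paths are illustrative, and search semantics is an assumption of the sketch):

# Sketch: check which paths the updated codespell exclude pattern skips.
# The sample paths are illustrative, not taken from the repository.
import re

exclude = re.compile(
    r"benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*"
    r"|csrc/rocm/.*|csrc/gradlib/.*")

paths = [
    "csrc/rocm/attention.cu",        # newly excluded
    "csrc/gradlib/gemm_tuner.cpp",   # newly excluded
    "vllm/model_executor/layers/quantization/utils/fp8_utils.py",  # still checked
]
for path in paths:
    skipped = exclude.search(path) is not None
    print(f"{path}: {'skipped' if skipped else 'checked'}")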
Memory profiling test (measure_current_non_torch):

@@ -326,6 +326,11 @@ def measure_current_non_torch():
     # Add some extra non-torch memory 256 MiB (simulate NCCL)
     handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
+    # This is an analytic value and it is exact:
+    # the only non-torch memory increase is the 256 MiB allocated above.
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
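
The new assertion can demand exact equality because the only non-torch allocation between the first and last monitored sample is the single 256 MiB cudaMalloc above, so the difference of the two samples is analytic rather than approximate. A minimal standalone sketch of the same bookkeeping, with hypothetical names and a plain counter standing in for the driver-reported memory (no CUDA required):

# Sketch of the bookkeeping the assertion relies on: sample a counter,
# perform one known allocation, sample again; the difference is exact.
# All names here are hypothetical stand-ins for the test's real helpers.
from dataclasses import dataclass, field
from typing import List

@dataclass
class MonitoredValues:
    values: List[int] = field(default_factory=list)

non_torch_bytes = 0  # stands in for memory reported by the CUDA driver

def sample(monitored: MonitoredValues) -> None:
    monitored.values.append(non_torch_bytes)

monitored_values = MonitoredValues()
sample(monitored_values)               # baseline sample
non_torch_bytes += 256 * 1024 * 1024   # the one known "allocation"
sample(monitored_values)               # sample after the allocation

measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024  # exact, by construction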
ROCm attention backend (forward):

@@ -681,10 +681,12 @@ def forward(
                     seq_lens,
                     make_attn_mask=False)  # type: ignore
                 full_scales = (
-                    1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                    1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                    1.0 / layer._q_scale.item(),
+                    1.0 / layer._k_scale.item(), 1.0 /
+                    layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                     fp8_out_scale.item()) if (
-                        fp8_out_scale and layer._q_scale and layer._prob_scale
+                        fp8_out_scale and layer._q_scale
+                        and layer._prob_scale
                         and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                 out, _ = self.attn_func(
                     query,
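
The hunk only re-wraps the construction of full_scales; the value is unchanged: when an fp8 output scale is present, the per-tensor q/k/v/prob scales are inverted and packed together with the output scale, otherwise full_scales is None, gated additionally by the VLLM_USE_ROCM_FP8_FLASH_ATTN flag. A small sketch of that gating with hypothetical inputs (the real code reads the scales from the attention layer and the flag from envs):

# Sketch of the full_scales gating: invert the per-tensor scales and pack
# them only when the required scales are present and the feature is enabled.
# The scale tensors and the boolean flag are hypothetical stand-ins.
from typing import Optional, Tuple
import torch

def build_full_scales(
    q_scale: torch.Tensor,
    k_scale: torch.Tensor,
    v_scale: torch.Tensor,
    prob_scale: torch.Tensor,
    fp8_out_scale: Optional[torch.Tensor],
    use_rocm_fp8_flash_attn: bool,
) -> Optional[Tuple[float, float, float, float, float]]:
    if (fp8_out_scale is not None and q_scale is not None
            and prob_scale is not None and use_rocm_fp8_flash_attn):
        return (1.0 / q_scale.item(), 1.0 / k_scale.item(),
                1.0 / v_scale.item(), 1.0 / prob_scale.item(),
                fp8_out_scale.item())
    return None

# Scalar tensors stand in for the layer's quantization scales.
print(build_full_scales(torch.tensor(0.5), torch.tensor(0.25),
                        torch.tensor(0.125), torch.tensor(1.0),
                        torch.tensor(2.0), True))
# (2.0, 4.0, 8.0, 1.0, 2.0)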
FP8 quantization utils (model_executor/layers/quantization/utils); whitespace-only re-indentation of the input_to_float8 signature:

@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""