# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

# Module-level cache of per-device E2M1 lookup tensors. Reusing one tensor per
# device (rather than allocating a new one on every call) avoids breaking CUDA
# graph capture.
_DEVICE_E2M1_TENSORS = {}

# Constants for FP4 values (E2M1 format)
_E2M1_VALUES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
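# Each 4-bit E2M1 code used below packs 1 sign bit (bit 3) with a 3-bit index
# (bits 0-2) into the table above; the table lists the eight non-negative
# magnitudes representable with 2 exponent bits and 1 mantissa bit.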


def get_e2m1_tensor(device):
    """Get device-specific E2M1 lookup tensor, creating it if needed."""
    device_str = str(device)
    if device_str not in _DEVICE_E2M1_TENSORS:
        _DEVICE_E2M1_TENSORS[device_str] = torch.tensor(_E2M1_VALUES, dtype=torch.float32, device=device)
    return _DEVICE_E2M1_TENSORS[device_str]


def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
    m, n = x.shape
    device = x.device

    # Lookup table of FP4 magnitudes; absolute values map to indices 0-7
    kE2M1 = get_e2m1_tensor(device)

    # Find closest valid FP4 value index for each element
    abs_x = torch.abs(x)
    abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE2M1)  # [m, n, 8]
    abs_indices = torch.argmin(abs_diff_x, dim=-1)  # [m, n]

    # Apply sign bit (bit 3) to get final 4-bit representation
    indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)

    # Flatten to prepare for packing pairs of values
    indices = indices.reshape(-1)

    # Two 4-bit codes go into each byte, so the element count must be even
    assert indices.numel() % 2 == 0, f"Expected even number of elements, got {indices.numel()}"

    # Reshape to pair consecutive elements
    indices = indices.reshape(-1, 2)

    # Pack pairs of 4-bit values into 8-bit values: the first element of each
    # pair goes in the low nibble, the second in the high nibble
    packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8)

    return packed.reshape(m, n // 2)
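
# Worked example (illustrative comment only, not executed): packing the pair
# [0.5, -6.0] maps 0.5 -> code 0b0001 and -6.0 -> code 0b1111 (sign bit set,
# magnitude index 7), which combine into the single byte 0b1111_0001 = 0xF1.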


def unpack_fp4_from_uint8(
    a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
) -> torch.Tensor:
    """
    Unpacks uint8 values into fp4. Each uint8 holds two fp4 values: the low
    four bits hold the first value and the high four bits hold the next
    consecutive value. Each 4-bit code is an index that is mapped back to an
    fp4 value.

    :param a: tensor to unpack
    :param m: original dim 0 size of the unpacked tensor
    :param n: original dim 1 size of the unpacked tensor
    :param dtype: dense dtype to cast the unpacked tensor to
    """
    assert a.dtype == torch.uint8, f"expected uint8, got {a.dtype}"

    # Vectorized nibble processing
    a_flat = a.flatten()
    high = (a_flat & 0xF0) >> 4  # Upper nibbles
    low = a_flat & 0x0F  # Lower nibbles

    # Interleave nibbles back into their original element order
    combined = torch.stack((low, high), dim=1).flatten()

    # Vectorized sign and magnitude extraction
    signs = (combined & 0x08).to(torch.bool)  # Sign bits
    abs_vals = (combined & 0x07).to(torch.long)  # Magnitude indices

    # Device-aware lookup and sign application
    kE2M1 = get_e2m1_tensor(a.device)

    values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0)

    # Reshape to final form
    return values.reshape(m, n).to(dtype=dtype)
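
# Round-trip sketch (illustrative comment only; the tensor name `weights` is
# hypothetical): values already on the E2M1 grid survive pack/unpack exactly.
#
#   weights = torch.tensor([[0.5, -6.0, 1.5, 0.0]], dtype=torch.bfloat16)
#   packed = pack_fp4_to_uint8(weights)             # shape [1, 2], uint8
#   restored = unpack_fp4_from_uint8(packed, 1, 4)  # shape [1, 4], bfloat16
#   assert torch.equal(weights, restored)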


def cast_to_fp4(x):
    """Round each element to the nearest E2M1-representable value, preserving sign.

    Magnitudes above 5.0 saturate to 6.0. The input tensor is not modified;
    the masked assignments operate on the copy produced by torch.abs.
    """
    sign = torch.sign(x)
    x = torch.abs(x)
    x[(x >= 0.0) & (x <= 0.25)] = 0.0
    x[(x > 0.25) & (x < 0.75)] = 0.5
    x[(x >= 0.75) & (x <= 1.25)] = 1.0
    x[(x > 1.25) & (x < 1.75)] = 1.5
    x[(x >= 1.75) & (x <= 2.5)] = 2.0
    x[(x > 2.5) & (x < 3.5)] = 3.0
    x[(x >= 3.5) & (x <= 5.0)] = 4.0
    x[x > 5.0] = 6.0
    return x * sign
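
# Illustrative example (comment only): cast_to_fp4(torch.tensor([0.3, -1.1, 2.6, 7.0]))
# returns tensor([0.5, -1.0, 3.0, 6.0]) under the thresholds above.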