- 
                Notifications
    You must be signed in to change notification settings 
- Fork 292
          [Bugfix] Fix missing host cuTensorMapEncodeIm2col call
          #1094
        
          New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -106,6 +106,35 @@ def call({}): | |
| \t}} | ||
| """ | ||
|  | ||
| TMA_IM2COL_DESC_INIT_FUNC = """ | ||
| \tCUtensorMap {0}; | ||
| \tCUtensorMapDataType {0}_type= (CUtensorMapDataType){1}; | ||
| \tcuuint32_t {0}_tensorRank= {2}; | ||
| \tvoid *{0}_globalAddress= {3}; | ||
| \tcuuint64_t {0}_globalDim[{2}]= {{{4}}}; | ||
| \tcuuint64_t {0}_globalStride[{2}]= {{{5}}}; | ||
| \tcuuint32_t {0}_elementStrides[{2}]= {{{6}}}; | ||
| \tint {0}_lowerCorner[{2} - 2]= {{{7}}}; | ||
| \tint {0}_upperCorner[{2} - 2]= {{{8}}}; | ||
| \tcuuint32_t {0}_channelsPerPixel= {9}; | ||
| \tcuuint32_t {0}_pixelsPerColumn= {10}; | ||
| \tCUtensorMapInterleave {0}_interleave= (CUtensorMapInterleave){11}; | ||
| \tCUtensorMapSwizzle {0}_swizzle= (CUtensorMapSwizzle){12}; | ||
| \tCUtensorMapL2promotion {0}_l2Promotion= (CUtensorMapL2promotion){13}; | ||
| \tCUtensorMapFloatOOBfill {0}_oobFill= (CUtensorMapFloatOOBfill){14}; | ||
|  | ||
| \tCUresult {0}_result = CUTLASS_CUDA_DRIVER_WRAPPER_CALL(cuTensorMapEncodeIm2col)( | ||
| &{0}, {0}_type, {0}_tensorRank, {0}_globalAddress, {0}_globalDim, {0}_globalStride + 1, | ||
| {0}_lowerCorner, {0}_upperCorner, {0}_channelsPerPixel, {0}_pixelsPerColumn, {0}_elementStrides, {0}_interleave, {0}_swizzle, {0}_l2Promotion, {0}_oobFill); | ||
|  | ||
| \tif ({0}_result != CUDA_SUCCESS) {{ | ||
| \t\tstd::stringstream ss; | ||
| \t\tss << "Error: Failed to initialize the TMA descriptor {0}"; | ||
| \t\tsnprintf(error_buf, ERROR_BUF_SIZE, "%s", ss.str().c_str()); | ||
| \t\treturn -1; | ||
| \t}} | ||
| """ | ||
|  | ||
| TMA_DESC_INIT_FUNC_PY = """ | ||
| \t{0}_type = cuda.bindings.driver.CUtensorMapDataType({1}) | ||
| \t{0}_tensorRank = {2} | ||
|  | @@ -401,50 +430,92 @@ def generate_tma_descriptor_args(self, desc_name_map: Dict[str, str], | |
| if len(args) < 3: | ||
| raise ValueError( | ||
| f"TMA descriptor args too short: {len(args)} elements, expected at least 3") | ||
| _, dtype, tensor_rank, globalAddress, *remaining_args = args[1:] | ||
|  | ||
| tma_create_str, _, dtype, tensor_rank, globalAddress, *remaining_args = args | ||
|  | ||
| is_img2col = (tma_create_str.value == "__tvm_tensormap_create_im2col") | ||
| dtype = self._pythonic_expr(dtype) | ||
| tensor_rank = int(self._pythonic_expr(tensor_rank)) | ||
|  | ||
| # Validate tensor_rank | ||
| if not isinstance(tensor_rank, int) or tensor_rank <= 0: | ||
| raise ValueError(f"Invalid tensor_rank: {tensor_rank}. Must be a positive integer") | ||
|  | ||
| # Calculate required length for remaining_args | ||
| expected_args_len = 4 * tensor_rank + 4 # 4 groups of tensor_rank size + 4 parameters | ||
| if len(remaining_args) < expected_args_len: | ||
| raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " | ||
| f"expected {expected_args_len} for tensor_rank {tensor_rank}") | ||
|  | ||
| # Extract dimensions and strides using list slicing | ||
| global_dim = remaining_args[:tensor_rank] | ||
| global_stride = remaining_args[tensor_rank:2 * tensor_rank] | ||
| box_dim = remaining_args[2 * tensor_rank:3 * tensor_rank] | ||
| element_strides = remaining_args[3 * tensor_rank:4 * tensor_rank] | ||
|  | ||
| global_dim = [self._pythonic_expr(i) for i in global_dim] | ||
| global_stride = [self._pythonic_expr(i) for i in global_stride] | ||
| box_dim = [self._pythonic_expr(i) for i in box_dim] | ||
| element_strides = [self._pythonic_expr(i) for i in element_strides] | ||
|  | ||
| # Extract remaining parameters | ||
| try: | ||
| interleave, swizzle, l2Promotion, oobFill = remaining_args[4 * tensor_rank:4 * | ||
| tensor_rank + 4] | ||
| interleave = self._pythonic_expr(interleave) | ||
| swizzle = self._pythonic_expr(swizzle) | ||
| l2Promotion = self._pythonic_expr(l2Promotion) | ||
| oobFill = self._pythonic_expr(oobFill) | ||
| except ValueError as e: | ||
| raise ValueError( | ||
| "Failed to unpack the final 4 TMA parameters (interleave, swizzle, l2Promotion, oobFill)" | ||
| ) from e | ||
| if not is_img2col: | ||
| # Calculate required length for remaining_args | ||
| expected_args_len = 4 * tensor_rank + 4 # 4 groups of tensor_rank size + 4 parameters | ||
| if len(remaining_args) < expected_args_len: | ||
| raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " | ||
| f"expected {expected_args_len} for tensor_rank {tensor_rank}") | ||
|  | ||
| # Extract dimensions and strides using list slicing | ||
| global_dim = remaining_args[:tensor_rank] | ||
| global_stride = remaining_args[tensor_rank:2 * tensor_rank] | ||
| box_dim = remaining_args[2 * tensor_rank:3 * tensor_rank] | ||
| element_strides = remaining_args[3 * tensor_rank:4 * tensor_rank] | ||
|  | ||
| global_dim = [self._pythonic_expr(i) for i in global_dim] | ||
| global_stride = [self._pythonic_expr(i) for i in global_stride] | ||
| box_dim = [self._pythonic_expr(i) for i in box_dim] | ||
| element_strides = [self._pythonic_expr(i) for i in element_strides] | ||
|  | ||
| # Extract remaining parameters | ||
| try: | ||
| interleave, swizzle, l2Promotion, oobFill = remaining_args[4 * tensor_rank:4 * | ||
| tensor_rank + 4] | ||
| interleave = self._pythonic_expr(interleave) | ||
| swizzle = self._pythonic_expr(swizzle) | ||
| l2Promotion = self._pythonic_expr(l2Promotion) | ||
| oobFill = self._pythonic_expr(oobFill) | ||
| except ValueError as e: | ||
| raise ValueError( | ||
| "Failed to unpack the final 4 TMA parameters (interleave, swizzle, l2Promotion, oobFill)" | ||
| ) from e | ||
|  | ||
| tma_descripter_init += TMA_DESC_INIT_FUNC.format( | ||
| handle_name, dtype, tensor_rank, globalAddress, ",".join(global_dim), | ||
| ",".join(global_stride), ",".join(box_dim), ",".join(element_strides), | ||
| interleave, swizzle, l2Promotion, oobFill) | ||
| else: | ||
| # Calculate required length for remaining_args | ||
| expected_args_len = 5 * tensor_rank + 2 | ||
| if len(remaining_args) < expected_args_len: | ||
| raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " | ||
| f"expected {expected_args_len} for tensor_rank {tensor_rank}") | ||
|  | ||
| # Extract dimensions and strides using list slicing | ||
| global_dim = remaining_args[:tensor_rank] | ||
| global_stride = remaining_args[tensor_rank:2 * tensor_rank] | ||
| element_strides = remaining_args[2 * tensor_rank:3 * tensor_rank] | ||
| lower_corner = remaining_args[3 * tensor_rank:4 * tensor_rank - 2] | ||
| upper_corner = remaining_args[4 * tensor_rank - 2:5 * tensor_rank - 4] | ||
| global_dim = [self._pythonic_expr(i) for i in global_dim] | ||
| global_stride = [self._pythonic_expr(i) for i in global_stride] | ||
| element_strides = [self._pythonic_expr(i) for i in element_strides] | ||
| lower_corner = [self._pythonic_expr(i) for i in lower_corner] | ||
| upper_corner = [self._pythonic_expr(i) for i in upper_corner] | ||
|  | ||
| # Extract remaining parameters | ||
| try: | ||
| smem_box_pixel, smem_box_channel, interleave, swizzle, l2Promotion, oobFill = remaining_args[ | ||
| 5 * tensor_rank - 4:5 * tensor_rank + 2] | ||
| smem_box_pixel = self._pythonic_expr(smem_box_pixel) | ||
| smem_box_channel = self._pythonic_expr(smem_box_channel) | ||
| interleave = self._pythonic_expr(interleave) | ||
| swizzle = self._pythonic_expr(swizzle) | ||
| l2Promotion = self._pythonic_expr(l2Promotion) | ||
| oobFill = self._pythonic_expr(oobFill) | ||
| except ValueError as e: | ||
| raise ValueError( | ||
| "Failed to unpack the final 6 TMA parameters (smem_box_pixel, smem_box_channel, interleave, swizzle, l2Promotion, oobFill)" | ||
| ) from e | ||
|  | ||
| tma_descripter_init += TMA_IM2COL_DESC_INIT_FUNC.format( | ||
| handle_name, dtype, tensor_rank, globalAddress, ",".join(global_dim), | ||
| ",".join(global_stride), ",".join(element_strides), ",".join(lower_corner), | ||
| ",".join(upper_corner), smem_box_channel, smem_box_pixel, interleave, swizzle, | ||
| l2Promotion, oobFill) | ||
|  | ||
| tma_descripter_init += TMA_DESC_INIT_FUNC.format(handle_name, dtype, tensor_rank, | ||
| globalAddress, ",".join(global_dim), | ||
| ",".join(global_stride), | ||
| ",".join(box_dim), | ||
| ",".join(element_strides), interleave, | ||
| swizzle, l2Promotion, oobFill) | ||
| return tma_descripter_init | ||
| 
      Comment on lines
    
      441
     to 
      519
    
   There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
 The new branch in  Useful? React with 👍 / 👎. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can support in future. | ||
|  | ||
| def parse_source_information(self): | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
NVRTC path will mis-parse the new layout and lacks im2col support.
You changed the descriptor-arg layout (tma_create_str first) and added im2col handling for the CUDA host path, but TLNVRTCSourceWrapper.generate_tma_descriptor_args still:
This will break the NVRTC backend and Python wrapper when tma_descriptor_args follow the new layout or when im2col is used.
Apply the following updates:
Please confirm the NVRTC bindings expose cuTensorMapEncodeIm2col. If not, we’ll need to add it to the Python driver first.
🏁 Script executed:
Length of output: 4322
NVRTC path will break: lacks im2col support and uses old argument layout.
The NVRTC backend (TLNVRTCSourceWrapper) was not updated alongside the CUDA host changes. Two critical gaps:
args[1:]), but the new CUDA path now parses tma_create_str first, without the args[1:] offset. cuTensorMapEncodeTiled is available in the Python templates; im2col support is absent. The proposed patch correctly adds
TMA_IM2COL_DESC_INIT_FUNC_PY and updates argument parsing for both branches. However, verify that cuda.bindings provides full coverage of and 1:1 access to the CUDA host APIs from Python, which should include cuTensorMapEncodeIm2col. If this binding is unavailable in the installed version, it must be added to the Python driver first before the patch can work. 🧰 Tools
🪛 Ruff (0.14.1)
442-442: Avoid specifying long messages outside the exception class
(TRY003)
448-449: Avoid specifying long messages outside the exception class
(TRY003)
471-473: Avoid specifying long messages outside the exception class
(TRY003)
483-484: Avoid specifying long messages outside the exception class
(TRY003)
509-511: Avoid specifying long messages outside the exception class
(TRY003)
🤖 Prompt for AI Agents