Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clang cuda functions not handling concepts correctly. #67507

Open
rnburn opened this issue Sep 27, 2023 · 20 comments · May be fixed by #67721
Open

Clang cuda functions not handling concepts correctly. #67507

rnburn opened this issue Sep 27, 2023 · 20 comments · May be fixed by #67721
Labels
c++20 clang:frontend Language frontend issues, e.g. anything involving "Sema" cuda

Comments

@rnburn
Copy link

rnburn commented Sep 27, 2023

Take the example code posted below.

If I compile, I get this error

> clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
root@3aa37681f2bd:/src/concept_bug# clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
main.cc:116:3: error: no matching function for call to 'kernel'
  116 |   kernel<<<1, 1>>>(bucket_sums, generators, scalars, 0);
      |   ^~~~~~
main.cc:26:17: note: candidate template ignored: constraints not satisfied [with T = E]
   26 | __global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
      |                 ^
main.cc:25:11: note: because 'sxt::bascrv::element97' does not satisfy 'element'
   25 | template <bascrv::element T>
      |           ^
main.cc:13:3: note: because 'double_element(res, e)' would be invalid: no matching function for call to 'double_element'
   13 |   double_element(res, e);
      |   ^
1 error generated when compiling for sm_70.

But the concept element should be satified. In fact, if I uncomment the line

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

the compilation will succeed.

Here is the version of clang I'm using

clang++ --version
Ubuntu clang version 18.0.0 (++20230913042131+f3fdc967a87d-1~exp1~20230913042254.1182)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

code:

#include <iostream>

// element.h
#include <cstdint>
#include <concepts>

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element
//--------------------------------------------------------------------------------------------------
template <class T>
concept element = requires(T& res, const T& e) {
  double_element(res, e);
  add(res, e, e);
  neg(res, e);
  add_inplace(res, res);
  { T::identity() } noexcept -> std::same_as<T>;
  mark(res);
  { is_marked(e) } noexcept -> std::same_as<bool>;
};
} // namespace sxt::bascrv

// kernel.h
namespace sxt::mtxbk {
template <bascrv::element T>
__global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
                       unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}

template <bascrv::element T>
void f(T* bucket_sums, const T* generators, const uint8_t* scalars, unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}
} // namespace sxt::mtxbk

// example_element.h
#include <cstdint>

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element97
//--------------------------------------------------------------------------------------------------
struct element97 {
  uint32_t value;
  bool marked = false;

  static element97 identity() noexcept {
    return {0};
  }

  bool operator==(const element97&) const noexcept = default;
};

//--------------------------------------------------------------------------------------------------
// double_element
//--------------------------------------------------------------------------------------------------
inline void double_element(element97& res, const element97& e) noexcept {
  res.value = (e.value + e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// neg
//--------------------------------------------------------------------------------------------------
inline void neg(element97& res, const element97& e) noexcept {
  res.value = (97u - e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add
//--------------------------------------------------------------------------------------------------
inline void add(element97& res, const element97& x, const element97& y) noexcept {
  res.value = (x.value + y.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add_inplace
//--------------------------------------------------------------------------------------------------
inline void add_inplace(element97& res, const element97& x) noexcept {
  res.value = (res.value + x.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// mark
//--------------------------------------------------------------------------------------------------
inline void mark(element97& res) noexcept {
  res.marked = true;
}

//--------------------------------------------------------------------------------------------------
// is_marked
//--------------------------------------------------------------------------------------------------
inline bool is_marked(const element97& e) noexcept {
  return e.marked;
}
} // namespace sxt::bascrv

// main
using namespace sxt;
using namespace sxt::mtxbk;

int main() {
  using E = bascrv::element97;
  E* bucket_sums = nullptr;
  const E* generators = nullptr;
  const uint8_t* scalars = nullptr;

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

  kernel<<<1, 1>>>(bucket_sums, generators, scalars, 0);

  return 0;
}
@github-actions github-actions bot added the clang Clang issues not falling into any other category label Sep 27, 2023
@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

That's an interesting problem.

main.cc:13:3: note: because 'double_element(res, e)' would be invalid: no matching function for call to 'double_element'
   13 |   double_element(res, e);
      |   ^
1 error generated when compiling for sm_70.

The key here is generated when compiling for sm_70.. I don't know when/where/how concepts consider the double_element() function, but I suspect compiler tries to do an overload resolution and, because it happens during GPU-side compilation, it finds a host-only function double_element and rightfully complains that it is not callable on the GPU.

If you were to make the function constexpr or __host__ __device__, or provide a device overload, things may work a bit better.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

I was able to find some workarounds, but the behavior definitely doesn't seem like it's correct.

I suspect concept satisfiability is cached, so if I call the a host functions with the same signature; then call the GPU function, it works as expected.

I don't think concept satisfiability should dependent on whether the functions have device or host markups. You may never invoke associated functions in device code; but they should still be used to check the concept requirements.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

Also, nvcc compiles the example fine.

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

@yxsamliu ^^^ FYI.

@rnburn Would it be possible to further reduce the reproducer?

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

Reduced version:

// element.h

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element
//--------------------------------------------------------------------------------------------------
template <class T>
concept element = requires(T& res, const T& e) {
  double_element(res, e);
};
} // namespace sxt::bascrv

// kernel.h
namespace sxt::mtxbk {
template <bascrv::element T>
__global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
                       unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}

template <bascrv::element T>
void f(T* bucket_sums, const T* generators, const uint8_t* scalars, unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}
} // namespace sxt::mtxbk

// example_element.h

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element97
//--------------------------------------------------------------------------------------------------
struct element97 {
  unsigned value;
};

//--------------------------------------------------------------------------------------------------
// double_element
//--------------------------------------------------------------------------------------------------
inline void double_element(element97& res, const element97& e) noexcept {
  res.value = (e.value + e.value) % 97u;
}
} // namespace sxt::bascrv

// main
using namespace sxt;
using namespace sxt::mtxbk;

int main() {
  using E = bascrv::element97;
  E* bucket_sums = nullptr;
  const E* generators = nullptr;
  const uint8_t* scalars = nullptr;

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

  kernel<<<1, 1>>>(bucket_sums, generators, scalars, 0);

  return 0;
}

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

As I suggested above, double_element must be __host_- __device__ or constexpr. https://godbolt.org/z/MYh6TKjKK
Otherwise there will be an inconsistency in how the same concept is handled during host and GPU compilation.

I believe clang is correct to error out here, because the kernel, which must satisfy the requirement is not allowed to call a host-only double_element(res, e); function.

The main issue is that it's not clear how concepts are supposed to interact with target-based function overloading.
I.e. the concept expects a function to be callable, but that would normally depend on the callee context. The same function may be callable from some functions, but not others. That context may not propagate correctly to the place where we consider the requires.

This is largely unexplored territory. Collecting the corner cases like these and the workarounds would be useful as the starting point, so we can assess what the issues are and what would be the way to deal with them.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

As I suggested above, double_element must be _host- device or constexpr. https://godbolt.org/z/MYh6TKjKK
Otherwise there will be an inconsistency in how the same concept is handled during host and GPU compilation.

I believe clang is correct to error out here, because the kernel, which must satisfy the requirement is not allowed to call a host-only double_element(res, e); function.

This cannot be correct. If I call

1: host_funct_with_same_signature(args...); // this is valid
2: kernel<<<1, 1>>>(bucket_sums, generators, scalars, 0); // now this works because the compiler already determined the concept as satisfied and it won't check again

Remove 1 and you get the compilation error. Even if your argument is that clang is correct to error (which I very much don't think it is). Then it should error consistently.

The main issue is that it's not clear how concepts are supposed to interact with target-based function overloading.
I.e. the concept expects a function to be callable, but that would normally depend on the callee context.

Concept satisfiability should be completely separate from whether a function is host, device, etc. This is what nvcc appears to do. And a concept can have requirements that aren't invoked from device code.

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

As I said -- it's unexplored territory. I do agree that compiler should behave consistently. Clang not complaining in the second case looks like a bug to me, not the reason why the original error is wrong.

Concept satisfiability should be completely separate from whether a function is host, device, etc.
I'm not sure that will be possible to do.
E.g. in your example you could apply the same concept to both GPU-only kernel and a CPU-only function.

The 'requires' will be satisfied for the host function, but not for the GPU kernel. Granted, my familiarity with concepts is rather rudimentary and it's likely that I'm misunderstanding something. If I'm wrong here, please tell me why you would expect the example above, as written, to compile on the GPU.

This is what nvcc appears to do

NVCC not reporting an error may be a bug, too. I do not see how that concept can be satisfied for the GPU kernel, when double_element() is not callable on the GPU.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

Concepts can be high level and bundle a lot of behavior. You might have a generic concept such as geometric_shape, for example, that builds on other lower level concept primitives and would require a type to implement hundreds of different function requirements. Perhaps only a few would be relevant to GPU code. Nonetheless you may want to express generic APIs in terms of geometric_shape as having a collection of known behavior could be simpler and more maintainable than going through individual generic functions and determining minimal requirements.

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

That's all good. But how do we reconcile it with the fact that with heterogeneous computing in the picture, some of the assumptions concepts may be implicitly relying on are no longer true? As I said, it's an unexplored territory.

You expectation appears to be that the concepts written for the host-only code should magically work for GPU code, too, and I don't think it's always doable, at least for the concepts that depend on call-ability of some functions. It would work for a subset of such functions, the ones that are callable on both host and device -- that would be mostly constexpr functions and lambdas, but even there will be issues as the overload sets may be different during the host and the device compilation.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

You expectation appears to be that the concepts written for the host-only code should magically work for GPU code, too, and I don't think it's always doable

No, I would expect an error if they get invoked from device code, but no error otherwise. For example, geometric_shape might require a function compute_area. You might use that function to compute the areas for a large array of objects on the GPU. But it may also require a function draw_object that interacts with display drivers and makes no sense to invoke from device code.

I would expect geometric_shape to be satisfied if the functions are implemented. I would expect a compilation error if you try to invoke compute_area from device code and it doesn't have __device__. I would not expect an error if draw_object is not marked with __device__ because it should never be called from device code.

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

Can you give me a specific example? Here's a compiler explorer page with both host and device side CUDA compilation: https://cuda.godbolt.org/z/4rjda6r6T that you can use as a starting point.

Please fill in an example of the scenario you've outlined above with the expectations you would have for the code.
This would help me to understand what you have in mind, what happens now and, maybe, what should happen.

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

Here's a sketch of what should work

#include <concepts>

template <class T>
concept geometric_shape = requires(const T& sh) {
  { area(sh) } -> std::convertible_to<double>;
  draw(sh);
};

struct triangle {
  double x1, y1, x2, y2, x3, y3;
};

__host__ __device__
double area(const triangle& tr) {
  return 123;
}

void draw(const triangle& tr) {
  // interact with display
}

template <geometric_shape T>
__global__ void compute_areas(double* areas, const T* shapes, unsigned n) {
  // invoke areas[i] = area(shapes[i]) for all i
}

int main() {
  double* areas = nullptr;
  triangle* shapes = nullptr;
  // static_assert(geometric_shape<triangle>); // this will not error
  compute_areas<<<32, 64>>>(areas, shapes, 0); // this should work
  return 0;
}

@Artem-B
Copy link
Member

Artem-B commented Sep 27, 2023

Surprisingly enough it's std::convertible that gives us trouble here: https://cuda.godbolt.org/z/d1W1sM7rj
This is something I should look at, but it's somewhat unrelated to this discussion.

template <class T>
concept geometric_shape = requires(const T& sh) {
  { area(sh) } -> std::convertible_to<double>;
  draw(sh);
};

What are the expectations enforced by the concept?

  • { area(sh) } -> std::convertible_to<double> Does this only care that the return type of area() is convertible to double?
  • draw(sh) -- does it require draw to be callable? If so, from where? For CUDA it does matter. Considering that geometric_shape is used in the template parameter type, it's not clear. When would this check be expected to fail? E.g. should the requirement be unsatisfied if a kernel would not be able to call draw(sh) ? Or should it fail if the host can't call it?

What would a concept look like for the following conditions:

  • "GPU can call func(a)"
  • "GPU can not call func(a)")
  • "host can call func(a)"
  • "host can not call func(a)"

@rnburn
Copy link
Author

rnburn commented Sep 27, 2023

{ area(sh) } -> std::convertible_to Does this only care that the return type of area() is convertible to double?

Yes. That's what it's expressing. For an example, you could also write as std::same_as<double>.

What would a concept look like for the following conditions:

I'm not expecting host/device to factor into whether a concept is satisfied. So, if you call

template <geometric_shape T>
__global__ void compute_areas(double* areas, const T* shapes, unsigned n) {
  // ....
     draw(shapes[i]);
 // ...
}

I'd expect to not get a substitution failure, but still get an error that's equivalent to calling the function with no concept constraint.

template <class T>
__global__ void compute_areas(double* areas, const T* shapes, unsigned n) {
  // ....
     draw(shapes[i]);
 // ...
}

Perhaps it might be useful to consider extending concept syntax to be able to express host-device specific requirements; but I image the most basic version would ignore __device__, __host__ markup (or at least ignore __device__). You'd still get errors if you get to the point where you're trying to mix calls to non-device functions in device-only IR.

@yxsamliu
Copy link
Collaborator

If we allow host function to fulfill requirement of a device function, we may end up with a fake satisfaction of a real requirement, which defeats the purpose of concept.

A better solution might be using different concepts for host and device templates. The concept for host only contains requirements on the host side. The concept for device only contains requirements on the device side.

@yxsamliu
Copy link
Collaborator

yxsamliu commented Sep 28, 2023

However, I think we may need to check concept requirements in host context by default, even if the check is done for device template.

This is to be compatible with C++ constexpr templates and also nvcc.

The rationale is that usually, users usually have different requirements on the host and device side. The requirements in a concept for C++ are for the host side by default. It may not be the users' intention to transfer them to the device side by function names.

By doing this, we keep the compatibility with C++ and nvcc, but lose the capability of expressing requirements on device side.

I think that may be acceptable as a baseline.

Then we can consider introducing some new extension for expressing requirements on device side, e.g.

  1. we may use syntax like public/private:

__device__: // requriements for device side follows

  1. we may allow adding __device__ attribute to the function calls in requirements to indicate it is a requirement for host side

  2. we may allow adding __device__ __host__ attributes to the concept to indicate all the requirements are for a specific side

we can use conditional macros #if __has_extension to enable these device side requirements for clang only.

@yxsamliu
Copy link
Collaborator

Currently, constraints are checked in Sema::FinishTemplateArgumentDeduction, where the current function in ASTContext is set to the instantiated template function. When resolving functions for the constraints, clang assumes the caller is the current function, that's why host functions failed candidacy in template kernels.

clang caches the constraint checking result per concept/type matching: https://github.com/llvm/llvm-project/blob/main/clang/lib/Sema/SemaConcept.cpp#L502 . It assumes the result does not depend on the instantiation context. That's why after a successful constraints-checking with a host function instantiation, it does not do the checking for the kernel instantiation.

I think the fix should be to let constraint checking have its own host/device context and by default it is host to be compatible with C++. This will make the constraint checking independent of callers and make the caching valid. Later on, we may introduce device constraints by other means, e.g. adding __device__ attribute per function call in constraints.

@yxsamliu yxsamliu linked a pull request Sep 28, 2023 that will close this issue
yxsamliu added a commit to yxsamliu/llvm-project that referenced this issue Sep 28, 2023
Currently, constraints are checked in Sema::FinishTemplateArgumentDeduction,
where the current function in ASTContext is set to the instantiated template
function. When resolving functions for the constraints, clang assumes the
caller is the current function, This causes incompatibility with nvcc and
also for constexpr template functions with C++.

clang caches the constraint checking result per concept/type matching. It
assumes the result does not depend on the instantiation context.

This patch let constraint checking have its own host/device context and by
default it is host to be compatible with C++. This makes the constraint
checking independent of callers and make the caching valid.

In the future, we may introduce device constraints by other means,
e.g. adding __device__ attribute per function call in constraints.

Fixes: llvm#67507
@Endilll Endilll added c++20 cuda clang:frontend Language frontend issues, e.g. anything involving "Sema" and removed clang Clang issues not falling into any other category labels Oct 2, 2023
@llvmbot
Copy link
Collaborator

llvmbot commented Oct 2, 2023

@llvm/issue-subscribers-c-20

Take the example code posted below.

If I compile, I get this error

&gt; clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
root@<!-- -->3aa37681f2bd:/src/concept_bug# clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
main.cc:116:3: error: no matching function for call to 'kernel'
  116 |   kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;(bucket_sums, generators, scalars, 0);
      |   ^~~~~~
main.cc:26:17: note: candidate template ignored: constraints not satisfied [with T = E]
   26 | __global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
      |                 ^
main.cc:25:11: note: because 'sxt::bascrv::element97' does not satisfy 'element'
   25 | template &lt;bascrv::element T&gt;
      |           ^
main.cc:13:3: note: because 'double_element(res, e)' would be invalid: no matching function for call to 'double_element'
   13 |   double_element(res, e);
      |   ^
1 error generated when compiling for sm_70.

But the concept element should be satified. In fact, if I uncomment the line

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

the compilation will succeed.

Here is the version of clang I'm using

clang++ --version
Ubuntu clang version 18.0.0 (++20230913042131+f3fdc967a87d-1~exp1~20230913042254.1182)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

code:

#include &lt;iostream&gt;

// element.h
#include &lt;cstdint&gt;
#include &lt;concepts&gt;

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element
//--------------------------------------------------------------------------------------------------
template &lt;class T&gt;
concept element = requires(T&amp; res, const T&amp; e) {
  double_element(res, e);
  add(res, e, e);
  neg(res, e);
  add_inplace(res, res);
  { T::identity() } noexcept -&gt; std::same_as&lt;T&gt;;
  mark(res);
  { is_marked(e) } noexcept -&gt; std::same_as&lt;bool&gt;;
};
} // namespace sxt::bascrv

// kernel.h
namespace sxt::mtxbk {
template &lt;bascrv::element T&gt;
__global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
                       unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}

template &lt;bascrv::element T&gt;
void f(T* bucket_sums, const T* generators, const uint8_t* scalars, unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}
} // namespace sxt::mtxbk

// example_element.h
#include &lt;cstdint&gt;

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element97
//--------------------------------------------------------------------------------------------------
struct element97 {
  uint32_t value;
  bool marked = false;

  static element97 identity() noexcept {
    return {0};
  }

  bool operator==(const element97&amp;) const noexcept = default;
};

//--------------------------------------------------------------------------------------------------
// double_element
//--------------------------------------------------------------------------------------------------
inline void double_element(element97&amp; res, const element97&amp; e) noexcept {
  res.value = (e.value + e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// neg
//--------------------------------------------------------------------------------------------------
inline void neg(element97&amp; res, const element97&amp; e) noexcept {
  res.value = (97u - e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add
//--------------------------------------------------------------------------------------------------
inline void add(element97&amp; res, const element97&amp; x, const element97&amp; y) noexcept {
  res.value = (x.value + y.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add_inplace
//--------------------------------------------------------------------------------------------------
inline void add_inplace(element97&amp; res, const element97&amp; x) noexcept {
  res.value = (res.value + x.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// mark
//--------------------------------------------------------------------------------------------------
inline void mark(element97&amp; res) noexcept {
  res.marked = true;
}

//--------------------------------------------------------------------------------------------------
// is_marked
//--------------------------------------------------------------------------------------------------
inline bool is_marked(const element97&amp; e) noexcept {
  return e.marked;
}
} // namespace sxt::bascrv

// main
using namespace sxt;
using namespace sxt::mtxbk;

int main() {
  using E = bascrv::element97;
  E* bucket_sums = nullptr;
  const E* generators = nullptr;
  const uint8_t* scalars = nullptr;

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

  kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;(bucket_sums, generators, scalars, 0);

  return 0;
}

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 2, 2023

@llvm/issue-subscribers-clang-frontend

Take the example code posted below.

If I compile, I get this error

&gt; clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
root@<!-- -->3aa37681f2bd:/src/concept_bug# clang++ -x cuda -c --cuda-gpu-arch=sm_70 -std=c++20 main.cc
clang++: warning: CUDA version is newer than the latest partially supported version 12.1 [-Wunknown-cuda-version]
main.cc:116:3: error: no matching function for call to 'kernel'
  116 |   kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;(bucket_sums, generators, scalars, 0);
      |   ^~~~~~
main.cc:26:17: note: candidate template ignored: constraints not satisfied [with T = E]
   26 | __global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
      |                 ^
main.cc:25:11: note: because 'sxt::bascrv::element97' does not satisfy 'element'
   25 | template &lt;bascrv::element T&gt;
      |           ^
main.cc:13:3: note: because 'double_element(res, e)' would be invalid: no matching function for call to 'double_element'
   13 |   double_element(res, e);
      |   ^
1 error generated when compiling for sm_70.

But the concept element should be satified. In fact, if I uncomment the line

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

the compilation will succeed.

Here is the version of clang I'm using

clang++ --version
Ubuntu clang version 18.0.0 (++20230913042131+f3fdc967a87d-1~exp1~20230913042254.1182)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

code:

#include &lt;iostream&gt;

// element.h
#include &lt;cstdint&gt;
#include &lt;concepts&gt;

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element
//--------------------------------------------------------------------------------------------------
template &lt;class T&gt;
concept element = requires(T&amp; res, const T&amp; e) {
  double_element(res, e);
  add(res, e, e);
  neg(res, e);
  add_inplace(res, res);
  { T::identity() } noexcept -&gt; std::same_as&lt;T&gt;;
  mark(res);
  { is_marked(e) } noexcept -&gt; std::same_as&lt;bool&gt;;
};
} // namespace sxt::bascrv

// kernel.h
namespace sxt::mtxbk {
template &lt;bascrv::element T&gt;
__global__ void kernel(T* bucket_sums, const T* generators, const uint8_t* scalars,
                       unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}

template &lt;bascrv::element T&gt;
void f(T* bucket_sums, const T* generators, const uint8_t* scalars, unsigned length) {
  (void)bucket_sums;
  (void)generators;
  (void)scalars;
  (void)length;
}
} // namespace sxt::mtxbk

// example_element.h
#include &lt;cstdint&gt;

namespace sxt::bascrv {
//--------------------------------------------------------------------------------------------------
// element97
//--------------------------------------------------------------------------------------------------
struct element97 {
  uint32_t value;
  bool marked = false;

  static element97 identity() noexcept {
    return {0};
  }

  bool operator==(const element97&amp;) const noexcept = default;
};

//--------------------------------------------------------------------------------------------------
// double_element
//--------------------------------------------------------------------------------------------------
inline void double_element(element97&amp; res, const element97&amp; e) noexcept {
  res.value = (e.value + e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// neg
//--------------------------------------------------------------------------------------------------
inline void neg(element97&amp; res, const element97&amp; e) noexcept {
  res.value = (97u - e.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add
//--------------------------------------------------------------------------------------------------
inline void add(element97&amp; res, const element97&amp; x, const element97&amp; y) noexcept {
  res.value = (x.value + y.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// add_inplace
//--------------------------------------------------------------------------------------------------
inline void add_inplace(element97&amp; res, const element97&amp; x) noexcept {
  res.value = (res.value + x.value) % 97u;
}

//--------------------------------------------------------------------------------------------------
// mark
//--------------------------------------------------------------------------------------------------
inline void mark(element97&amp; res) noexcept {
  res.marked = true;
}

//--------------------------------------------------------------------------------------------------
// is_marked
//--------------------------------------------------------------------------------------------------
inline bool is_marked(const element97&amp; e) noexcept {
  return e.marked;
}
} // namespace sxt::bascrv

// main
using namespace sxt;
using namespace sxt::mtxbk;

int main() {
  using E = bascrv::element97;
  E* bucket_sums = nullptr;
  const E* generators = nullptr;
  const uint8_t* scalars = nullptr;

  // f(bucket_sums, generators, scalars, 0); // uncomment this line and things compile. ¯\_(ツ)_/

  kernel&lt;&lt;&lt;1, 1&gt;&gt;&gt;(bucket_sums, generators, scalars, 0);

  return 0;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
c++20 clang:frontend Language frontend issues, e.g. anything involving "Sema" cuda
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants