Skip to content

Commit 00519f4

Browse files
authored
dev-0.9.0 (#87)
* remove deprecated APIs * tensor type name * #include <climits> * move ttl::experimental::zip to upstream * fix namespace * fix namespace * use size_t in basic_allocator<R, cuda_memory> * size_t * show cuda error string (#88)
1 parent 86f9d66 commit 00519f4

18 files changed

+424
-73
lines changed
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#pragma once

#if defined(__GNUC__) && !defined(__clang__)
#pragma message("ttl::experimental::zip is error-prone, use with care!")
#endif

#include <array>
#include <functional>
#include <numeric>
#include <tuple>

namespace ttl
{
namespace experimental
{
namespace internal
{
// Wraps N ranges so a range-based for loop can walk them in lock-step,
// yielding a std::tuple of the current elements of each range.
// Holds const references only: the zipped ranges must outlive the zipper.
template <typename... Ts>
class zipper_t
{
    static constexpr auto arity = sizeof...(Ts);

    const std::tuple<const Ts &...> ranges_;

    // One underlying iterator per zipped range, all advanced together.
    template <typename... Iters>
    class iterator
    {
        std::tuple<Iters...> is_;

        // Dereferences every component iterator into a fresh tuple.
        template <size_t... Is>
        auto operator*(std::index_sequence<Is...>)
        {
            return std::make_tuple(*std::get<Is>(is_)...);
        }

        // Swallows its arguments; exists only to force evaluation of a
        // parameter pack expanded in a function-argument list (pre-C++17
        // substitute for a fold expression).
        template <typename... P>
        static void noop(const P &...)
        {
        }

        // Pre-increments one component iterator; the dummy int is what
        // gets passed to noop() above.
        template <typename Iter>
        static int incr(Iter &it)
        {
            ++it;
            return 0;
        }

        // Steps every component iterator forward by one.
        template <size_t... Is>
        void _advance(std::index_sequence<Is...>)
        {
            noop(incr(std::get<Is>(is_))...);
        }

        // Component-wise inequality: true while ANY component differs.
        // A range-for therefore stops only when ALL iterators reach end
        // at once — zipping ranges of different lengths runs the shorter
        // iterators past their end (hence the pragma warning above).
        template <size_t... Is>
        bool neq(std::index_sequence<Is...>, const iterator &other) const
        {
            // TODO: expand the expression
            const std::array<bool, arity> diffs(
                {(std::get<Is>(is_) != std::get<Is>(other.is_))...});
            for (const bool d : diffs) {
                if (d) { return true; }
            }
            return false;
        }

      public:
        iterator(const Iters &... i) : is_(i...) {}

        bool operator!=(const iterator &other) const
        {
            return neq(std::make_index_sequence<arity>(), other);
        }

        void operator++() { _advance(std::make_index_sequence<arity>()); }

        auto operator*()
        {
            return (operator*)(std::make_index_sequence<arity>());
        }
    };

    // Deduces iterator<Iters...> from the component iterators.
    template <typename... Iters>
    static iterator<Iters...> make_iterator(const Iters &... is)
    {
        return iterator<Iters...>(is...);
    }

    template <size_t... Is>
    auto begin(std::index_sequence<Is...>) const
    {
        return make_iterator(std::get<Is>(ranges_).begin()...);
    }

    template <size_t... Is>
    auto end(std::index_sequence<Is...>) const
    {
        return make_iterator(std::get<Is>(ranges_).end()...);
    }

  public:
    zipper_t(const Ts &... ranges) : ranges_(ranges...) {}

    auto begin() const { return begin(std::make_index_sequence<arity>()); }

    auto end() const { return end(std::make_index_sequence<arity>()); }
};

// zip(r1, r2, ...) -> a zipper_t viewing the given ranges by const reference.
template <typename... Ts>
zipper_t<Ts...> zip(const Ts &... ranges)
{
    return zipper_t<Ts...>(ranges...);
}
}  // namespace internal
}  // namespace experimental
}  // namespace ttl

include/ttl/bits/fake_cuda_runtime.hpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#pragma once
2+
#include <cstdio>
23
#include <cstring>
34
#include <map>
45
#include <stdexcept>
@@ -12,7 +13,7 @@ constexpr const cudaMemcpyKind cudaMemcpyHostToDevice = 1;
1213
constexpr const cudaMemcpyKind cudaMemcpyDeviceToHost = 2;
1314
constexpr const cudaMemcpyKind cudaMemcpyDeviceToDevice = 3;
1415

15-
class fake_device
16+
class fake_cuda_device
1617
{
1718
std::map<const void *, size_t> _allocs;
1819

@@ -33,7 +34,9 @@ class fake_device
3334
}
3435

3536
public:
36-
~fake_device() { check_leak(); }
37+
fake_cuda_device() { std::printf("using fake_cuda_device!\n"); }
38+
39+
~fake_cuda_device() { check_leak(); }
3740

3841
void *alloc(size_t size)
3942
{
@@ -51,7 +54,8 @@ class fake_device
5154
_allocs.erase(data);
5255
}
5356

54-
void memcpy(void *dst, const void *src, int size, cudaMemcpyKind dir) const
57+
void memcpy(void *dst, const void *src, size_t size,
58+
cudaMemcpyKind dir) const
5559
{
5660
switch (dir) {
5761
case cudaMemcpyHostToDevice:
@@ -67,9 +71,9 @@ class fake_device
6771
}
6872
};
6973

70-
fake_device fake_cuda;
74+
fake_cuda_device fake_cuda;
7175

72-
cudaError_t cudaMalloc(void **ptr, int count)
76+
cudaError_t cudaMalloc(void **ptr, size_t count)
7377
{
7478
*ptr = fake_cuda.alloc(count);
7579
return cudaSuccess;
@@ -87,3 +91,8 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t size,
8791
fake_cuda.memcpy(dst, src, size, dir);
8892
return cudaSuccess;
8993
}
94+
95+
// Fake stand-in for the CUDA runtime's cudaGetErrorString: produces a
// readable tag that embeds the numeric error code.
std::string cudaGetErrorString(const cudaError_t err)
{
    const std::string code = std::to_string(static_cast<int>(err));
    return "fake_cudaError_t(" + code + ")";
}

include/ttl/bits/flat_tensor_mixin.hpp

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,6 @@ class flat_tensor_mixin
2727
template <rank_t r, typename A1 = typename trait::Access>
2828
using T = basic_tensor<R, basic_shape<r, Dim>, D, A1>;
2929

30-
template <rank_t r, typename A1>
31-
[[deprecated]] T<r, A1> ranked_as() const
32-
{
33-
return T<r, A1>(data_.get(), shape_.template as_ranked<r>());
34-
}
35-
3630
protected:
3731
using allocator = basic_allocator<R, D>;
3832

@@ -67,18 +61,6 @@ class flat_tensor_mixin
6761
using T = basic_tensor<R, basic_shape<r, Dim>, D, Access>;
6862
return T(data(), shape_.template ranked<r>());
6963
}
70-
71-
template <rank_t r>
72-
[[deprecated]] T<r, readwrite> ref_as() const
73-
{
74-
return ranked_as<r, readwrite>();
75-
}
76-
77-
template <rank_t r>
78-
[[deprecated]] T<r, readonly> view_as() const
79-
{
80-
return ranked_as<r, readonly>();
81-
}
8264
};
8365
} // namespace internal
8466
} // namespace ttl

include/ttl/bits/raw_tensor_mixin.hpp

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -113,21 +113,6 @@ class raw_tensor_mixin
113113
using T = basic_tensor<R, basic_shape<r, Dim>, D, Access>;
114114
return T(data<R>(), shape_.template ranked<r>());
115115
}
116-
117-
template <typename R, rank_t r, typename A1 = A>
118-
[[deprecated]] basic_tensor<R, basic_shape<r, Dim>, D, A1>
119-
ranked_as() const {
120-
return basic_tensor<R, basic_shape<r, Dim>, D, A1>(
121-
data<R>(), shape_.template ranked<r>());
122-
}
123-
124-
template <typename R, rank_t r>
125-
[[deprecated]] basic_tensor<R, basic_shape<r, Dim>, D, readwrite> ref_as()
126-
const { return ranked_as<R, r, readwrite>(); }
127-
128-
template <typename R, rank_t r>
129-
[[deprecated]] basic_tensor<R, basic_shape<r, Dim>, D, readonly> view_as()
130-
const { return ranked_as<R, r, readonly>(); }
131116
};
132117
} // namespace internal
133118
} // namespace ttl

include/ttl/bits/std_cuda_allocator.hpp

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22
#include <cstddef>
33
#include <stdexcept>
4+
#include <string>
45

56
#include <ttl/bits/std_cuda_runtime.hpp>
67
#include <ttl/bits/std_device.hpp>
@@ -10,6 +11,23 @@ namespace ttl
1011
{
1112
namespace internal
1213
{
14+
class std_cuda_error_checker_t
15+
{
16+
const std::string func_name_;
17+
18+
public:
19+
std_cuda_error_checker_t(const char *func_name) : func_name_(func_name) {}
20+
21+
void operator<<(const cudaError_t err) const
22+
{
23+
if (err != cudaSuccess) {
24+
throw std::runtime_error(func_name_ + " failed with: " +
25+
std::to_string(static_cast<int>(err)) +
26+
": " + cudaGetErrorString(err));
27+
}
28+
}
29+
}; // namespace ttl
30+
1331
struct cuda_copier {
1432
static constexpr auto h2d = cudaMemcpyHostToDevice;
1533
static constexpr auto d2h = cudaMemcpyDeviceToHost;
@@ -18,10 +36,8 @@ struct cuda_copier {
1836
template <cudaMemcpyKind dir>
1937
static void copy(void *dst, const void *src, size_t size)
2038
{
21-
const cudaError_t err = cudaMemcpy(dst, src, size, dir);
22-
if (err != cudaSuccess) {
23-
throw std::runtime_error("cudaMemcpy failed");
24-
}
39+
static std_cuda_error_checker_t check("cudaMemcpy");
40+
check << cudaMemcpy(dst, src, size, dir);
2541
}
2642
};
2743

@@ -49,15 +65,13 @@ template <typename R>
4965
class basic_allocator<R, cuda_memory>
5066
{
5167
public:
52-
R *operator()(int count)
68+
R *operator()(size_t count)
5369
{
5470
void *deviceMem;
5571
// cudaMalloc<R>(&deviceMem, count);
5672
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
57-
const cudaError_t err = cudaMalloc(&deviceMem, count * sizeof(R));
58-
if (err != cudaSuccess) {
59-
throw std::runtime_error("cudaMalloc failed");
60-
}
73+
static std_cuda_error_checker_t check("cudaMalloc");
74+
check << cudaMalloc(&deviceMem, count * sizeof(R));
6175
return reinterpret_cast<R *>(deviceMem);
6276
}
6377
};
@@ -68,8 +82,8 @@ class basic_deallocator<R, cuda_memory>
6882
public:
6983
void operator()(R *data)
7084
{
71-
const cudaError_t err = cudaFree(data);
72-
if (err != cudaSuccess) { throw std::runtime_error("cudaFree failed"); }
85+
static std_cuda_error_checker_t check("cudaFree");
86+
check << cudaFree(data);
7387
}
7488
};
7589
} // namespace internal

include/ttl/bits/std_encoding.hpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,21 @@ class basic_scalar_encoding
3737
public:
3838
static constexpr V value = (category << 16) | (byte_num << 8) | byte_size;
3939
};
40+
41+
enum class scaler_type : uint32_t {
42+
u8 = basic_scalar_encoding<uint8_t, uint32_t>::value,
43+
u16 = basic_scalar_encoding<uint16_t, uint32_t>::value,
44+
u32 = basic_scalar_encoding<uint32_t, uint32_t>::value,
45+
u64 = basic_scalar_encoding<uint64_t, uint32_t>::value,
46+
47+
i8 = basic_scalar_encoding<int8_t, uint32_t>::value,
48+
i16 = basic_scalar_encoding<int16_t, uint32_t>::value,
49+
i32 = basic_scalar_encoding<int32_t, uint32_t>::value,
50+
i64 = basic_scalar_encoding<int64_t, uint32_t>::value,
51+
52+
f32 = basic_scalar_encoding<float, uint32_t>::value,
53+
f64 = basic_scalar_encoding<double, uint32_t>::value,
54+
};
4055
} // namespace internal
4156

4257
namespace experimental
@@ -52,7 +67,7 @@ struct std_encoding {
5267
template <typename R>
5368
static constexpr value_type value()
5469
{
55-
return internal::basic_scalar_encoding<R, value_type>::value;
70+
return ttl::internal::basic_scalar_encoding<R, value_type>::value;
5671
}
5772
};
5873
} // namespace experimental

include/ttl/bits/std_reflect.hpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#pragma once
#include <cxxabi.h>

#include <climits>
#include <cstdlib>
#include <string>
#include <type_traits>
#include <typeinfo>

namespace ttl
{
namespace internal
{
// Returns the human-readable (demangled) name of T, e.g. "unsigned int"
// instead of the mangled "j".  Falls back to the raw mangled name if
// demangling fails.
template <typename T>
std::string demangled_type_info_name()
{
    int status = 0;
    // abi::__cxa_demangle returns a malloc(3)-allocated buffer (or nullptr
    // on failure) that the caller must free.  The original code returned it
    // directly, leaking the buffer — and invoking UB when nullptr was
    // converted to std::string.
    char *const buf =
        abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status);
    if (buf == nullptr || status != 0) {
        std::free(buf);  // free(nullptr) is a no-op
        return typeid(T).name();
    }
    std::string name(buf);
    std::free(buf);
    return name;
}

// Single-character category tag for a scalar type:
// 'f' = floating point, 'i' = signed integral, 'u' = unsigned integral,
// 's' = anything else (user-defined / non-arithmetic).
template <typename R>
constexpr char scalar_type_prefix()
{
    if (std::is_floating_point<R>::value) {
        return 'f';
    } else if (std::is_integral<R>::value) {
        return std::is_signed<R>::value ? 'i' : 'u';
    } else {
        return 's';
    }
}

// scalar_type_name<IsScalar, R>()() yields a short name for R.  The bool
// parameter selects the compact "<prefix><bits>" form (true) or the
// demangled RTTI name (false).
template <bool, typename R>
class scalar_type_name;

// Non-scalar types: fall back to the demangled RTTI name.
template <typename R>
class scalar_type_name<false, R>
{
  public:
    std::string operator()() const { return demangled_type_info_name<R>(); }
};

// Scalar types: "<prefix><bit width>", e.g. int32_t -> "i32", double -> "f64".
template <typename R>
class scalar_type_name<true, R>
{
  public:
    std::string operator()() const
    {
        return scalar_type_prefix<R>() + std::to_string(sizeof(R) * CHAR_BIT);
    }
};
}  // namespace internal
}  // namespace ttl

0 commit comments

Comments (0)