Skip to content


[Bugfix] Fix support for dimension like integers and ScalarType (vllm…
Browse files Browse the repository at this point in the history
Rate limit · GitHub

Whoa there!

You have triggered an abuse detection mechanism.

Please wait a few minutes before you try again;
in some cases this may take up to an hour.

bnellnm authored Oct 17, 2024
1 parent 0f41fbe commit eca2c5f
Showing 22 changed files with 427 additions and 677 deletions.
14 changes: 6 additions & 8 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -230,14 +230,12 @@ steps:
- pytest -v -s compile/

# TODO: re-write in comparison tests, and fix symbolic shape
# for quantization ops.
# - label: "PyTorch Fullgraph Test" # 18min
# source_file_dependencies:
# - vllm/
# - tests/compile
# commands:
# - pytest -v -s compile/
- label: "PyTorch Fullgraph Test" # 18min
- vllm/
- tests/compile
- pytest -v -s compile/

- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
18 changes: 0 additions & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -83,24 +83,6 @@ endif()
find_package(Torch REQUIRED)

message(STATUS "Enabling core extension.")

# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)



# Forward the non-CUDA device extensions to external CMake scripts.
209 changes: 4 additions & 205 deletions csrc/core/scalar_type.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <torch/custom_class.h>
#include <torch/library.h>

namespace vllm {

@@ -9,12 +10,7 @@ namespace vllm {
// in particular it can be used to represent sub-byte data types (something
// that torch.dtype currently does not support).
// ScalarTypeTorch is a subclass of ScalarType that is compatible with
// TORCH_LIBRARY, making it accessible from Python as well, meaning this class
// can be used as an argument for custom operators, helping to simplify these
// interfaces.
// The type definitions on the Python side can be found in: vllm/_core_ext.pyi
// The type definitions on the Python side can be found in: vllm/
// these type definitions should be kept up to date with any Python API changes
// here.
@@ -308,204 +304,7 @@ class ScalarType {

// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from
// torch::CustomClassHolder), we use multiple inheritance here since we cannot
// have ScalarType inherit from torch::CustomClassHolder and have a constexpr
// constructor at the same time (torch::CustomClassHolder does not have a
// constexpr destructor)
// See also:
class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias,
bool _signed)
: ScalarType(exponent, mantissa, bias, _signed){};

ScalarTypeTorch(ScalarType type) : ScalarType(type){};

using Base = ScalarType;
using Self = ScalarTypeTorch;
using SelfPtr = c10::intrusive_ptr<Self>;

static void check_size_bits(int64_t size_bits, bool signed_) {
size_bits <=
"size_bits bit width is too large to be represented");

static void check_bias(int64_t bias) {
using Bias = decltype(std::declval<Self>().bias);
TORCH_CHECK(bias <= std::numeric_limits<Bias>::max() &&
bias >= std::numeric_limits<Bias>::min(),
"bias too large or small to be represented");

static void check_exponent(int64_t exponent) {
exponent <=
"exponent bit width is too large to be represented");

static void check_mantissa(int64_t mantissa) {
mantissa <=
"mantissa bit width is too large to be represented");

static SelfPtr int_(int64_t size_bits, c10::optional<int64_t> bias) {
check_size_bits(size_bits, true);
return c10::make_intrusive<Self>(
ScalarType::int_(size_bits, bias.value_or(0)));

static SelfPtr uint(int64_t size_bits, c10::optional<int64_t> bias) {
check_size_bits(size_bits, true);
return c10::make_intrusive<Self>(
ScalarType::uint(size_bits, bias.value_or(0)));

static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) {
return c10::make_intrusive<Self>(
ScalarType::float_IEEE754(exponent, mantissa));

static SelfPtr float_(int64_t exponent, int64_t mantissa,
bool finite_values_only, int64_t nan_repr) {
return c10::make_intrusive<Self>(ScalarType::float_(
exponent, mantissa, finite_values_only, NanRepr(nan_repr)));

// This needs to be implemented and throw a TypeError in order for
// PyTorch's opcheck to work on ops that use ScalarTypes.
int64_t len() const {
throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
"__len__ not implemented");
return 0;

// Serialize a ScalarType into a tuple of pairs, where each pair
// is a (fieldname, value).
// For simplicity, we are just going to convert to a ScalarTypeId.
std::tuple<std::tuple<std::string, int64_t>> obj_flatten() const {
return {{"ScalarType", id()}};

// Deserialize a scalar type that has been serialized by obj_flatten,
// ostensibly from a tuple of (member name, value) pairs, but in reality
// just a ScalarTypeId.
static SelfPtr obj_unflatten(
std::tuple<std::tuple<std::string, int64_t>> const& flat_type) {
return c10::make_intrusive<Self>(

template <typename T>
static void bind_readonly_property(torch::class_<Self>& cls,
std::string const& name, T Base::*field) {
auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) {
if constexpr (std::is_member_function_pointer_v<decltype(field)>) {
return (self.get()->*field)();
} else {
return self.get()->*field;

auto getter_func = [field = std::move(field),
getter_func_helper = std::move(getter_func_helper)](
SelfPtr const& self) {
auto val = getter_func_helper(self);
// upconvert uint8_t, int32_t etc. to int64_t for python
if constexpr (std::is_integral_v<T>) {
return static_cast<int64_t>(val);
} else {
return val;

cls.def_property(name, getter_func);

template <typename MemberFunc, typename Cls>
static void bind_function(torch::class_<Self>& cls, const std::string& name,
MemberFunc Cls::*member) {
cls.def(name, [member = std::move(member)](SelfPtr const& self) {
return (self.get()->*member)();

template <typename Func>
static void bind_function(torch::class_<Self>& cls, const std::string& name,
Func func) {
cls.def(name, func);

template <typename Func>
static void bind_static_function(torch::class_<Self>& cls,
const std::string& name, Func func) {
cls.def_static(name, func);

static void bind_class(torch::Library& lib) {
auto cls = lib.class_<ScalarTypeTorch>("ScalarType")
.def(torch::init<int64_t, int64_t, int64_t, bool>());

// Bind Properties
bind_readonly_property(cls, "mantissa", &Base::mantissa);
bind_readonly_property(cls, "exponent", &Base::exponent);
bind_readonly_property(cls, "bias", &Base::bias);
bind_readonly_property(cls, "signed", &Base::is_signed);
bind_readonly_property(cls, "size_bits", &Base::size_bits);

// Bind member functions
bind_function(cls, "is_signed", &Base::is_signed);
bind_function(cls, "is_integer", &Base::is_integer);
bind_function(cls, "is_floating_point", &Base::is_floating_point);
bind_function(cls, "is_ieee_754", &Base::is_ieee_754);
bind_function(cls, "has_nans", &Base::has_nans);
bind_function(cls, "has_infs", &Base::has_infs);
bind_function(cls, "has_bias", &Base::has_bias);

bind_function(cls, "max", [](SelfPtr const& self) {
return std::visit([](auto arg) { return c10::IValue(arg); },
bind_function(cls, "min", [](SelfPtr const& self) {
return std::visit([](auto arg) { return c10::IValue(arg); },

bind_function(cls, "__len__", &ScalarTypeTorch::len);
bind_function(cls, "__str__", &Base::str);
bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) {
return *self == *other;
bind_function(cls, "__repr__", [](SelfPtr const& self) {
return "ScalarType." + self.get()->str();

bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten);
bind_static_function(cls, "__obj_unflatten__",

// Bind static functions (convenience constructors)
bind_static_function(cls, "int_", &ScalarTypeTorch::int_);
bind_static_function(cls, "uint", &ScalarTypeTorch::uint);
bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754);
bind_static_function(cls, "float_", &ScalarTypeTorch::float_);

using ScalarTypeId = int64_t;
using ScalarTypeTorchPtr = c10::intrusive_ptr<ScalarTypeTorch>;
using ScalarTypeId = ScalarType::Id;

// "rust style" names generally following:
16 changes: 0 additions & 16 deletions csrc/core/torch_bindings.cpp

This file was deleted.

15 changes: 8 additions & 7 deletions csrc/moe/
Original file line number Diff line number Diff line change
@@ -484,21 +484,22 @@ torch::Tensor marlin_gemm_moe(
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
torch::Tensor& b_zeros, const torch::Tensor& g_idx,
const torch::Tensor& perm, torch::Tensor& workspace,
vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n,
vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
int64_t moe_block_size, bool replicate_input, bool apply_weights) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
bool has_zp = b_zeros.size(1) != 0;
if (has_zp) {
*b_q_type == vllm::kU4,
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str());
b_q_type == vllm::kU4,
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
} else {
*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
"b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
"b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type.str());

int pack_factor = 32 / b_q_type->size_bits();
int pack_factor = 32 / b_q_type.size_bits();

int max_par = 4;

@@ -575,7 +576,7 @@ torch::Tensor marlin_gemm_moe(
topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
*b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
num_experts, topk, moe_block_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
replicate_input, apply_weights);
5 changes: 3 additions & 2 deletions csrc/moe/torch_bindings.cpp
Original file line number Diff line number Diff line change
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
"int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
"int b_q_type, SymInt size_m, "
"SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
"topk, "
"int moe_block_size, bool replicate_input, bool apply_weights)"
" -> Tensor");
// conditionally compiled so impl registration is in source file
Rate limit · GitHub

Whoa there!

You have triggered an abuse detection mechanism.

Please wait a few minutes before you try again;
in some cases this may take up to an hour.

0 comments on commit eca2c5f

Please sign in to comment.