Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

[WIP][DO NOT MERGE] Experimental vector types #513

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jenkins/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ WITH_CAFFE2=ON CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda CLANG_PREFIX=$(${CONDA_PREF
# Install the Python package, then run the Python test suite.
python setup.py install
./test_python/run_test.sh

# Smoke-test the examples: run every *.py file found under ./python/examples.
# NOTE(review): the file list comes from word-splitting the output of find and
# $f is expanded unquoted, so a path containing whitespace would be split into
# several arguments -- fine as long as no example file name contains any.
for f in $(find ./python/examples -name "*.py"); do
python $f
done

# Run test.sh with the two benchmarks named in FILTER_OUT; benchmark_MLP_model
# is then run separately below with its 2LUT cases filtered out.
FILTER_OUT="benchmark_MLP_model benchmark_kronecker" ./test.sh
# 2LUT can OOM on smaller Maxwells on our CI machines
./build/tc/benchmarks/benchmark_MLP_model --gtest_filter=-*2LUT*
Expand Down
190 changes: 190 additions & 0 deletions python/examples/min_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
import tensor_comprehensions as tc
from tensor_comprehensions.tc import set_logtostderr
from tensor_comprehensions.tc import set_debug_tc_mapper
from tensor_comprehensions.tc import set_debug_cuda

import numpy as np
import torch

#
## Example submitted by @mdouze, originally related to uint8 type support
#

# Flip `debug` to True to turn on stderr logging plus the mapper and CUDA
# debug switches exposed by tensor_comprehensions.
debug = False
for _enable in (set_logtostderr, set_debug_tc_mapper, set_debug_cuda):
    _enable(debug)

# Problem size: N code rows, M one-byte codes per row.
N, M = 1000, 32

# Random codes are drawn as (N, M // 4) 32-bit words and then reinterpreted
# byte-wise, which yields an (N, M) uint8 array.
codes = (
    np.random.randint(1 << 32, size=(N, M // 4))
    .astype('uint32')
    .view('uint8')
)
# One 256-entry float32 lookup table per code position.
luts = np.random.randn(M, 256).astype('float32')

# Move both inputs to the GPU (codes first, then the tables).
codes_t, luts_t = (torch.from_numpy(arr).cuda() for arr in (codes, luts))

lang = """
# mindis as a single kernel will require grid synchronization to run efficiently
def mindis(float(M, 256) L, uint8(N, M) Codes) -> (S, v, min_idx) {
S(n) +=! L(r_m, int32(Codes(n, r_m)))
v min=! S(r_n)
min_idx min=! (S(r_n) == v) ? r_n : N
}

# Even when splitting in 3 kernels, global device reduction will be needed to
# run efficiently
# don't try to run it with large sizes for now
def reduce_codes(float(M, 256) L, uint8(N, M) Codes) -> (S) {
S(n) +=! L(r_m, int32(Codes(n, r_m)))
}
def min_2d(float(N) S) -> (v) {
v min=! S(r_n)
}
def argmin_2d(float(N) S, float v) -> (min_idx) {
min_idx min=! (S(r_n) == v) ? r_n : N
}
"""

# Variant 1: everything in a single TC.
mindis = tc.define(lang, name="mindis")
S, v, min_idx = mindis(luts_t, codes_t)
print("minval: {} minidx: {}".format(v, min_idx))

# Variant 2: the same computation split into three TCs run back to back.
reduce_codes, min_2d, argmin_2d = (
    tc.define(lang, name=kernel)
    for kernel in ("reduce_codes", "min_2d", "argmin_2d")
)

S = reduce_codes(luts_t, codes_t)
v = min_2d(S)
min_idx = argmin_2d(S, v)

print("minval: {} minidx: {}".format(v, min_idx))

################################################################################
# Each reduction is probably easier to optimize with a 2-staged TC where we
# artificially increase parallelism and finish the reduction in a second kernel.
# Properly choosing D such that N = D * (N / D) should result in a good version
# with 5 kernels total.
#
# Stages: reduce_codes builds S(N); min_2d then min_1d reduce a (D, N / D) view
# of S down to the scalar minimum v; argmin_2d records, for every chunk d, the
# smallest flat index d * NBYD + r_nbyd whose value equals v (or N when the
# chunk holds no match); argmin_1d takes the minimum of those candidates.
################################################################################
N = 10 ** 5 # bump to 10**7 when ready for primetime
D = 1000
assert N % D == 0, "D={} must divide N={}".format(D, N)
M = 32

# argmin_1d must reduce over the candidate indices stored in MinIdx, not over
# the chunk position r_d, otherwise the result is a chunk number in [0, D)
# instead of an element index in [0, N).
lang = """
def reduce_codes(float(M, 256) L, uint8(N, M) Codes) -> (S) {
S(n) +=! L(r_m, int32(Codes(n, r_m)))
}
def min_2d(float(D, NBYD) S) -> (V) {
V(d) min=! S(d, r_nbyd)
}
def min_1d(float(D) V) -> (v) {
v min=! V(r_d)
}
def argmin_2d(float(D, NBYD) S, float v) -> (MinIdx) {
MinIdx(d) min=! (S(d, r_nbyd) == v) ? d * NBYD + r_nbyd : N
}
def argmin_1d(float(N) S, int32(D) MinIdx) -> (min_idx) {
min_idx min=! (MinIdx(r_d) < N) ? MinIdx(r_d) : N
}
"""

# (N, M // 4) random 32-bit words reinterpreted byte-wise as (N, M) uint8 codes.
codes = np.random.randint(1<<32, size=(N, M // 4)).astype('uint32')
codes = codes.view('uint8')
luts = np.random.randn(M, 256).astype('float32')

codes_t = torch.from_numpy(codes).cuda()
luts_t = torch.from_numpy(luts).cuda()

reduce_codes = tc.define(lang, name="reduce_codes")
min_2d = tc.define(lang, name="min_2d")
min_1d = tc.define(lang, name="min_1d")
argmin_2d = tc.define(lang, name="argmin_2d")
argmin_1d = tc.define(lang, name="argmin_1d")

S = reduce_codes(luts_t, codes_t)
# Floor division keeps the view size an int: on Python 3, N / D is a float,
# which Tensor.view rejects. The assert above guarantees D divides N exactly.
S_2d = S.view((D, N // D))
V = min_2d(S_2d)
v = min_1d(V)
MinIdx = argmin_2d(S_2d, v)
min_idx = argmin_1d(S, MinIdx)
print("minval: {} minidx: {}".format(v, min_idx))

################################################################################
# Longer form version has an extra k dimension we could use for parallelism
# Unfortunately it is a small dimension (16) so it won't saturate Pascal/Volta.
# So we may want to split in 5 to run efficiently.
################################################################################
# NOTE(review): N is already at the 10**7 "primetime" size mentioned for the
# previous experiment; S holds K x N floats, so lower N if device memory is
# tight.
N = 10 ** 7
D = 1000
assert N % D == 0, "D={} must divide N={}".format(D, N)
M = 32
K = 16
# (N, M // 4) random 32-bit words reinterpreted byte-wise as (N, M) uint8 codes.
codes = np.random.randint(1<<32, size=(N, M // 4)).astype('uint32')
codes = codes.view('uint8')
# K independent sets of M lookup tables with 256 entries each.
luts = np.random.randn(K, M, 256).astype('float32')

codes_t = torch.from_numpy(codes).cuda()
luts_t = torch.from_numpy(luts).cuda()

# Single-TC version: one min value and one min index per k.
lang = """
def mindis(float(K, M, 256) L, uint8(N, M) Codes) -> (S, V, MinIdx) {
S(k, n) +=! L(k, r_m, int32(Codes(n, r_m)))
V(k) min=! S(k, r_n)
MinIdx(k) min=! (S(k, r_n) == V(k)) ? r_n : N
}
"""

debug = False
set_logtostderr(debug)
set_debug_tc_mapper(debug)
set_debug_cuda(debug)

mindis = tc.define(lang, name="mindis")
S, V, MinIdx = mindis(luts_t, codes_t)
print("minvals: {}\nminidxs: {}".format(V, MinIdx))

# Five-TC version. argmin_2d stores, per (k, d), the smallest flat index
# d * NBYD + r_nbyd matching V(k) (or N when the chunk holds no match), so
# argmin_1d must reduce over those stored candidates rather than over the chunk
# position r_d, otherwise it yields a chunk number instead of an element index.
lang = """
def reduce_codes(float(K, M, 256) L, uint8(N, M) Codes) -> (S) {
S(k, n) +=! L(k, r_m, int32(Codes(n, r_m)))
}
def min_2d(float(K, D, NBYD) S) -> (V2) {
V2(k, d) min=! S(k, d, r_nbyd)
}
def min_1d(float(K, D) V2) -> (V) {
V(k) min=! V2(k, r_d)
}
def argmin_2d(float(K, D, NBYD) S, float(K) V) -> (MinIdx2) {
MinIdx2(k, d) min=! (S(k, d, r_nbyd) == V(k)) ? d * NBYD + r_nbyd : N
}
def argmin_1d(float(K, N) S, int32(K, D) MinIdx2) -> (MinIdx) {
MinIdx(k) min=! (MinIdx2(k, r_d) < N) ? MinIdx2(k, r_d) : N
}
"""

reduce_codes = tc.define(lang, name="reduce_codes")
min_2d = tc.define(lang, name="min_2d")
min_1d = tc.define(lang, name="min_1d")
argmin_2d = tc.define(lang, name="argmin_2d")
argmin_1d = tc.define(lang, name="argmin_1d")

S = reduce_codes(luts_t, codes_t)
# Floor division keeps the view size an int: on Python 3, N / D is a float,
# which Tensor.view rejects. The assert above guarantees D divides N exactly.
S_3d = S.view((K, D, N // D))
V2 = min_2d(S_3d)
V = min_1d(V2)
MinIdx2 = argmin_2d(S_3d, V)
MinIdx = argmin_1d(S, MinIdx2)
print("minval: {} minidx: {}".format(V, MinIdx))
19 changes: 16 additions & 3 deletions tc/aten/aten_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,22 @@ std::vector<at::Tensor> prepareOutputs(
auto atenBackend = inputs[0].type().backend();
for (size_t i = 0; i < outTensorInfo.size(); ++i) {
TensorInfo info(outTensorInfo[i]);
auto stype = at::toScalarType(info.dtype);
outputs.push_back(
at::getType(atenBackend, stype).tensor(at::IntList(info.shape)));
if (info.dtype.lanes == 1) {
auto stype = at::toScalarType(info.dtype);
outputs.push_back(
at::getType(atenBackend, stype).tensor(at::IntList(info.shape)));
} else {
// "Cast" to a larger strided tensor with 1 lane
auto lanes = info.dtype.lanes;
TC_CHECK(lanes == 2 || lanes == 4);
info.dtype.lanes = 1;
info.shape[info.shape.size() - 1] *= lanes;
auto stype = at::toScalarType(info.dtype);
auto T = at::getType(atenBackend, stype).tensor(at::IntList(info.shape));

info.shape[info.shape.size() - 1] /= lanes;
outputs.push_back(T.set_(*T.storage(), 0, info.shape));
}
}
return outputs;
}
Expand Down
81 changes: 81 additions & 0 deletions tc/core/libraries.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,92 @@ namespace code {
namespace c {

// Source-text prelude of type definitions, kept as a raw string literal
// (presumably spliced into generated kernel source -- confirm at the use site).
// The text declares:
//  - scalar aliases int8 ... uint64, float32 and float64;
//  - array2<T> / array4<T>: 2- and 4-lane value structs with a constructor
//    that broadcasts one scalar to every lane, a per-lane constructor, and
//    lane-wise operator+ / operator+=;
//  - x2() / x4() helpers that build a broadcast array2 / array4 from a scalar;
//  - <scalar>x2 / <scalar>x4 aliases for each scalar element type.
// The half-precision aliases stay commented out because cuda_fp16.h cannot be
// included (see the note at the top of the string).
// NOTE(review): int64/uint64 are spelled `long` / `unsigned long`, which are
// 64 bits wide only on LP64 targets, and int8 is plain `char`, whose signedness
// is implementation-defined -- confirm both hold for every target this text is
// compiled for.
constexpr auto types = R"C(
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
// #include <cuda_fp16.h>

// Halide type handling
typedef char int8;
typedef short int16;
typedef int int32;
typedef long int64;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long uint64;
// typedef half float16;
typedef float float32;
typedef double float64;

template <typename T>
struct array2 {
T x, y;
array2(T t) : x(t), y(t) {}
array2(T x, T y) : x(x), y(y) {}
array2<T> operator+(const array2<T>& a) const {
return array2<T>{
static_cast<T>(x + a.x),
static_cast<T>(y + a.y)
};
}
array2<T>& operator+=(const array2<T>& a) {
x += a.x;
y += a.y;
return *this;
}
};
template <typename Type>
array2<Type> x2(Type t) {
return array2<Type>(t);
}
typedef array2<char> int8x2;
typedef array2<short> int16x2;
typedef array2<int> int32x2;
typedef array2<long> int64x2;
typedef array2<unsigned char> uint8x2;
typedef array2<unsigned short> uint16x2;
typedef array2<unsigned int> uint32x2;
typedef array2<unsigned long> uint64x2;
// typedef array2<half> float16x2;
typedef array2<float> float32x2;
typedef array2<double> float64x2;

template <typename T>
struct array4 {
T x, y, z, w;
array4(T t) : x(t), y(t), z(t), w(t) {}
array4(T x, T y, T z, T w) : x(x), y(y), z(z), w(w) {}
array4<T> operator+(const array4<T>& a) const {
return array4<T>{
static_cast<T>(x + a.x),
static_cast<T>(y + a.y),
static_cast<T>(z + a.z),
static_cast<T>(w + a.w)
};
}
array4<T>& operator+=(const array4<T>& a) {
x += a.x;
y += a.y;
z += a.z;
w += a.w;
return *this;
}
};
template <typename Type>
array4<Type> x4(Type t) {
return array4<Type>(t);
}
typedef array4<char> int8x4;
typedef array4<short> int16x4;
typedef array4<int> int32x4;
typedef array4<long> int64x4;
typedef array4<unsigned char> uint8x4;
typedef array4<unsigned short> uint16x4;
typedef array4<unsigned int> uint32x4;
typedef array4<unsigned long> uint64x4;
// typedef array4<half> float16x4;
typedef array4<float> float32x4;
typedef array4<double> float64x4;
)C";

constexpr auto defines = R"C(
Expand Down
Loading