Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions THCUNN.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ local replacements =
{
{
['THTensor'] = 'THCudaTensor',
['THIndexTensor'] = 'THCudaTensor',
['THIntegerTensor'] = 'THCudaTensor',
['THIndex_t'] = 'float',
['THIndexTensor'] = 'THCudaLongTensor',
['THIndex_t'] = 'long',
['THInteger_t'] = 'float'
}
}
Expand Down
56 changes: 29 additions & 27 deletions lib/THCUNN/LookupTable.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ __device__ __forceinline__ bool warpHasCollision(int val)
}

__global__ void cunn_LookupTable_accGradParametersKernelByFeature(
float *input, float *gradOutput, float *gradWeight, float scale, long numel,
long *input, float *gradOutput, float *gradWeight, float scale, long numel,
long stride, int paddingValue) {

const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
Expand Down Expand Up @@ -96,8 +96,8 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
}

__global__ void cunn_LookupTable_accGradParametersKernel(
float *input, float *indices, float *gradOutput, float *gradWeight,
float *count, float defaultScale, long numel, long stride, int paddingValue) {
long *input, long *indices, float *gradOutput, float *gradWeight,
long *count, float defaultScale, long numel, long stride, int paddingValue) {

int idx = blockIdx.x * 4 + threadIdx.y;

Expand Down Expand Up @@ -164,33 +164,33 @@ void THNN_CudaLookupTable_accGradParameters(
THIndexTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THIntegerTensor *count,
THCudaTensor *sorted,
THCudaTensor *indices,
THIndexTensor *count,
THIndexTensor *sorted,
THIndexTensor *indices,
bool scaleGradByFreq,
int paddingValue,
float scale)
{
THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sorted, indices);
if (!(THCudaTensor_isContiguous(state, input) &&
if (!(THIndexTensor_(isContiguous)(state, input) &&
THCudaTensor_isContiguous(state, gradOutput) &&
THCudaTensor_isContiguous(state, gradWeight)))
{
THError("Tensors must be contiguous");
}

int nDim = THCudaTensor_nDimension(state, input);
int nDim = THIndexTensor_(nDimension)(state, input);
if (nDim != 1 && nDim != 2)
THError("input must be a vector or matrix");

long numel = THCudaTensor_nElement(state, input);
long numel = THIndexTensor_(nElement)(state, input);
long stride = gradWeight->stride[0];

cudaStream_t stream = THCState_getCurrentStream(state);

if (numel <= 768 && !scaleGradByFreq) {
cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>(
THCudaTensor_data(state, input),
THIndexTensor_(data)(state, input),
THCudaTensor_data(state, gradOutput),
THCudaTensor_data(state, gradWeight),
scale,
Expand All @@ -201,23 +201,25 @@ void THNN_CudaLookupTable_accGradParameters(
return;
}

THCudaTensor_resizeAs(state, sorted, input);
THCudaTensor_resizeAs(state, indices, input);
THLongStorage *inputSize = THIndexTensor_(newSizeOf)(state, input);
THIndexTensor_(resize)(state, sorted, inputSize, NULL);
THIndexTensor_(resize)(state, indices, inputSize, NULL);
THLongStorage_free(inputSize);

// Sort the inputs into sorted with the corresponding indices
THCudaTensor_sort(state, sorted, indices, input, 0, 0);
THIndexTensor_(sort)(state, sorted, indices, input, 0, 0);

float *sorted_data = THCudaTensor_data(state, sorted);
float *indices_data = THCudaTensor_data(state, indices);
float *count_data = NULL;
long *sorted_data = THIndexTensor_(data)(state, sorted);
long *indices_data = THIndexTensor_(data)(state, indices);
long *count_data = NULL;

if (scaleGradByFreq)
{
THIntegerTensor_(resizeAs)(state, count, input);
count_data = THIntegerTensor_(data)(state, count);
THIndexTensor_(resizeAs)(state, count, input);
count_data = THIndexTensor_(data)(state, count);

thrust::device_ptr<float> sorted_ptr(sorted_data);
thrust::device_ptr<float> count_ptr(count_data);
thrust::device_ptr<long> sorted_ptr(sorted_data);
thrust::device_ptr<long> count_ptr(count_data);

// Compute an increasing sequence per unique item in sorted:
// sorted: 2 5 5 5 7 7 8 9 9
Expand All @@ -243,8 +245,8 @@ void THNN_CudaLookupTable_accGradParameters(
thrust::make_reverse_iterator(sorted_ptr),
thrust::make_reverse_iterator(count_ptr + numel),
thrust::make_reverse_iterator(count_ptr + numel),
thrust::equal_to<float>(),
thrust::maximum<float>()
thrust::equal_to<long>(),
thrust::maximum<long>()
);
}

Expand Down Expand Up @@ -302,23 +304,23 @@ void THNN_CudaLookupTable_renorm(
float normType)
{
THCUNN_assertSameGPU(state, 2, idx, weight);
if (!(THCudaTensor_isContiguous(state, idx) &&
if (!(THIndexTensor_(isContiguous)(state, idx) &&
THCudaTensor_isContiguous(state, weight)))
{
THError("Tensors must be contiguous");
}
if (THCudaTensor_nDimension(state, idx) != 1)
if (THIndexTensor_(nDimension)(state, idx) != 1)
THError("idx must be a vector");
if (normType <= 0)
THError("non-positive-norm not supported");

long numel = THCudaTensor_nElement(state, idx);
long numel = THIndexTensor_(nElement)(state, idx);
long stride = weight->stride[0];

// get the unique indices
thrust::device_ptr<float> weight_ptr(THCudaTensor_data(state, weight));
thrust::device_ptr<float> idx_ptr(THCudaTensor_data(state, idx));
thrust::device_ptr<float> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
thrust::device_ptr<long> idx_ptr(THIndexTensor_(data)(state, idx));
thrust::device_ptr<long> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
numel = end_ptr - idx_ptr;

pow_v<float> unary_pow(normType);
Expand Down
17 changes: 8 additions & 9 deletions lib/THCUNN/SparseLinear.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void THNN_CudaSparseLinear_updateOutput(THCState *state,

init_cusparse();
cusparseXcoo2csr(cusparse_handle,
THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE);

// output = bias
Expand All @@ -82,13 +82,13 @@ void THNN_CudaSparseLinear_updateOutput(THCState *state,
THCudaTensor_select(state, sel, buffer, 1, h);
THCudaTensor_copy(state, sel, bias);
}

// output = W * x
float one = 1;
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
cusparseScsrmm(cusparse_handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
batchnum, outDim, inDim, nnz,
Expand All @@ -105,7 +105,7 @@ void THNN_CudaSparseLinear_updateOutput(THCState *state,
// We do work in the buffer to keep the output contiguous
THCudaTensor_copy(state, output, buffer);

cusparseDestroyMatDescr(descr);
cusparseDestroyMatDescr(descr);
descr = 0;
THCudaTensor_free(state, buffer);
THCudaTensor_free(state, sel);
Expand Down Expand Up @@ -141,7 +141,7 @@ void THNN_CudaSparseLinear_accGradParameters(
THCudaTensor *buf = THCudaTensor_new(state);
THCudaTensor *cols = THCudaTensor_new(state);
THCudaTensor *sel = THCudaTensor_new(state);
THCudaTensor *inds = THCudaTensor_new(state);
THCudaLongTensor *inds = THCudaLongTensor_new(state);
THCudaTensor *values = THCudaTensor_new(state);
THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
Expand Down Expand Up @@ -169,7 +169,7 @@ void THNN_CudaSparseLinear_accGradParameters(
init_cusparse();
// Secretly coo2csc
cusparseXcoo2csr(cusparse_handle,
THCudaIntTensor_data(state, colbuf), nnz, inDim,
THCudaIntTensor_data(state, colbuf), nnz, inDim,
THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);

// FORTRAN expects contiguous col-major matricies
Expand All @@ -182,7 +182,7 @@ void THNN_CudaSparseLinear_accGradParameters(
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
cusparseScsrmm(cusparse_handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
inDim, outDim, batchnum, nnz,
Expand All @@ -208,7 +208,7 @@ void THNN_CudaSparseLinear_accGradParameters(
THCudaTensor_free(state, buf);
THCudaTensor_free(state, sel);
THCudaTensor_free(state, cols);
THCudaTensor_free(state, inds);
THCudaLongTensor_free(state, inds);
THCudaTensor_free(state, values);
THCudaIntTensor_free(state, colbuf);
THCudaIntTensor_free(state, rowInds);
Expand Down Expand Up @@ -260,4 +260,3 @@ TH_API void THNN_CudaSparseLinear_updateParameters(

void THNN_CudaSparseLinear_cudaClearState(THCState *state) {
}

13 changes: 5 additions & 8 deletions lib/THCUNN/THCUNN.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#include <THC/THC.h>
#include <THC/THCApply.cuh>

#define THIndexTensor THCudaTensor
#define THIndexTensor_(NAME) THCudaTensor_ ## NAME

#define THIntegerTensor THCudaTensor
#define THIntegerTensor_(NAME) THCudaTensor_ ## NAME
#define THIndexTensor THCudaLongTensor
#define THIndexTensor_(NAME) THCudaLongTensor_ ## NAME

TH_API void THNN_CudaAbs_updateOutput(
THCState *state,
Expand Down Expand Up @@ -159,9 +156,9 @@ TH_API void THNN_CudaLookupTable_accGradParameters(
THIndexTensor *input,
THCudaTensor *gradOutput,
THCudaTensor *gradWeight,
THIntegerTensor *count,
THCudaTensor *sorted, // [OPTIONAL]
THCudaTensor *indices, // [OPTIONAL]
THIndexTensor *count,
THIndexTensor *sorted, // [OPTIONAL]
THIndexTensor *indices, // [OPTIONAL]
bool scaleGradByFreq,
int paddingValue,
float scale);
Expand Down