Skip to content

Commit

Permalink
- Fixed a CUDA 9.0 compatibility problem: the Census kernel was compiled with too many registers and could not run, because it must be launched with 1024 threads per block (the register usage did not leave room for that many threads)
Browse files Browse the repository at this point in the history

- Fixed an indexing bug, thanks to @ankit1089sny
  • Loading branch information
dhernandez committed Feb 8, 2018
1 parent 8528946 commit 0f78039
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cost_aggregation.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ __device__ __forceinline__ void CostAggregationGenericIndexesIncrement(int *inde
template<int add_index, bool recompute, bool join_dispcomputation>
__device__ __forceinline__ void CostAggregationDiagonalGenericIndexesIncrement(int *index, int *index_im, int *col, const int cols, const int initial_row, const int i, const int dis) {
*col += add_index;
if(add_index > 0 && *col > cols) {
if(add_index > 0 && *col >= cols) {
*col = 0;
} else if(*col < 0) {
*col = cols-1;
Expand Down
4 changes: 3 additions & 1 deletion costs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
#include "costs.h"
#include <stdio.h>

__global__ void CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols) {
__global__ void
__launch_bounds__(1024, 2)
CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols) {
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
const int idy = blockIdx.y*blockDim.y+threadIdx.y;

Expand Down

0 comments on commit 0f78039

Please sign in to comment.