Skip to content

Commit

Permalink
Merge pull request NVIDIA#305 from NVIDIA/fix_epilogue_spill
Browse files (browse the repository at this point in the history)
fix epilogue register spill
  • Loading branch information
Manish Gupta authored Jul 29, 2021
2 parents 4516b83 + a77c658 commit 1227351
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 11 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -703,7 +703,7 @@ class EpilogueWithReduction :

int output_row = destination_iterator.thread_start_row() + row_offset;

fetch = (output_row < destination_iterator.extent().row() && column_guard);
fetch = (output_row < destination_iterator.extent_row() && column_guard);
}
else {
fetch = true;
Expand Down Expand Up @@ -785,7 +785,7 @@ class EpilogueWithReduction :

int output_row = destination_iterator.thread_start_row() + row_offset;

fetch = (output_row < destination_iterator.extent().row() && column_guard);
fetch = (output_row < destination_iterator.extent_row() && column_guard);
}
else {
fetch = true;
Expand Down
18 changes: 9 additions & 9 deletions include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -172,7 +172,7 @@ class PredicatedTileIterator {
Mask mask_;

/// Extent of the matrix tile in rows
TensorCoord extent_;
Index extent_row_;

/// A thread's starting row position (assuming steady-state predicates have been computed)
Index thread_start_row_;
Expand All @@ -184,7 +184,7 @@ class PredicatedTileIterator {
// Static asserts about internal strides
//

static_assert(sizeof(extent_.row()) == 4, "Expected 32b extents");
static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");

Expand All @@ -209,20 +209,20 @@ class PredicatedTileIterator {
int thread_idx,
TensorCoord threadblock_offset = TensorCoord()
):
params_(params),
extent_(extent)
params_(params)
{

TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;

extent_row_ = extent.row();
thread_start_row_ = thread_offset.row();

// Initialize predicates
CUTLASS_PRAGMA_UNROLL
for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {

mask_.predicates[c] = ((thread_offset.column()
+ ThreadMap::Delta::kColumn * c) < extent_.column());
+ ThreadMap::Delta::kColumn * c) < extent.column());
}

// Null pointer performs no accesses
Expand Down Expand Up @@ -268,7 +268,7 @@ class PredicatedTileIterator {
+ group * ThreadMap::Delta::kGroup
+ cluster * ThreadMap::Delta::kCluster;

bool row_guard = ((row_offset + thread_start_row_) < extent_.row());
bool row_guard = ((row_offset + thread_start_row_) < extent_row_);

AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);

Expand Down Expand Up @@ -332,7 +332,7 @@ class PredicatedTileIterator {
+ group * ThreadMap::Delta::kGroup
+ cluster * ThreadMap::Delta::kCluster;

bool row_guard = ((row_offset + thread_start_row_) < extent_.row());
bool row_guard = ((row_offset + thread_start_row_) < extent_row_);

AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);

Expand Down Expand Up @@ -379,8 +379,8 @@ class PredicatedTileIterator {

/// Extent of the matrix in rows
CUTLASS_DEVICE
TensorCoord extent() const {
return extent_;
Index extent_row() const {
return extent_row_;
}

/// Advances to the next position to load or store
Expand Down

0 comments on commit 1227351

Please sign in to comment.