@@ -166,7 +166,11 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
 
   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
+#ifdef PADDLE_WITH_HIP
+  constexpr int kBlockDim = 256;
+#else
   constexpr int kBlockDim = 512;
+#endif
 
   int max_chunk_num = -1;
   int vec_size = 8;
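
This hunk halves the thread-block size when Paddle is built against ROCm: AMD GPUs schedule 64-lane wavefronts rather than 32-lane warps, so a 512-thread block is 8 wavefronts, and 256 threads (4 wavefronts) is presumably the better occupancy trade-off there. Below is a minimal, self-contained sketch of the same compile-time selection pattern; the `FillKernel`/`LaunchFill` names and the `gpuStream_t` alias are illustrative assumptions, not Paddle's actual launch code.

```cpp
// Minimal sketch of the compile-time block-size selection this diff
// introduces. Everything except the PADDLE_WITH_HIP macro is illustrative.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
constexpr int kBlockDim = 256;  // AMD: 4 wavefronts of 64 lanes per block
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
constexpr int kBlockDim = 512;  // NVIDIA: 16 warps of 32 lanes per block
#endif

template <typename T>
__global__ void FillKernel(T *out, T value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

template <typename T>
void LaunchFill(T *out, T value, int n, gpuStream_t stream) {
  int grid = (n + kBlockDim - 1) / kBlockDim;  // ceil-divide over the data
  FillKernel<T><<<grid, kBlockDim, 0, stream>>>(out, value, n);
}
```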
@@ -805,7 +809,11 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }
 
+#ifdef PADDLE_WITH_HIP
+  const int block_dim = 256;
+#else
   const int block_dim = 512;
+#endif
 
   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
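
The same 256-versus-512 split appears here as a function-scope constant. An invariant worth making explicit (my inference; the diff does not state it) is that the block size must stay a whole multiple of the hardware warp or wavefront width, or warp-level primitives would straddle partial warps. A hedged sketch that encodes this as a compile-time check, with names (`kWarpSize`, `kBlockDim`) that are mine rather than Paddle's:

```cpp
// Hedged illustration: tie the block size to the warp/wavefront width at
// compile time so warp-level primitives always cover full warps.
#ifdef PADDLE_WITH_HIP
constexpr int kWarpSize = 64;   // AMD wavefront width
constexpr int kBlockDim = 256;  // 4 wavefronts per block
#else
constexpr int kWarpSize = 32;   // NVIDIA warp width
constexpr int kBlockDim = 512;  // 16 warps per block
#endif
static_assert(kBlockDim % kWarpSize == 0,
              "block size must be a whole number of warps");
```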
paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h (4 additions, 0 deletions)
@@ -134,7 +134,11 @@ __device__ T reduceSum(T val, int tid, int len) {
   // Use warp-level parallelism and assume the warp size
   // is 32, which may differ across GPUs,
   // but most cards' warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
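
Note that `reduceSum` deliberately shadows the built-in `warpSize`: neither CUDA's nor HIP's built-in is a compile-time constant, and a constant is needed to size the `__shared__` buffer, so the diff hard-codes 64 for AMD wavefronts next to the existing 32 for NVIDIA warps (the comment above the branch predates the HIP case). The following is a self-contained sketch of the warp-size-dependent block reduction this enables; `ShflDown` and `BlockReduceSum` are illustrative stand-ins for Paddle's `CREATE_SHFL_MASK` and shuffle helpers, and the code assumes `blockDim.x` is a multiple of the warp width.

```cpp
// Sketch only: warp-size-aware block reduction in the style of reduceSum.
// Names below (kWarpSize, ShflDown, BlockReduceSum) are illustrative.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
constexpr int kWarpSize = 64;  // AMD wavefront
template <typename T>
__device__ T ShflDown(T val, int delta) {
  return __shfl_down(val, delta);  // ROCm has no *_sync shuffle variants
}
#else
#include <cuda_runtime.h>
constexpr int kWarpSize = 32;  // NVIDIA warp
template <typename T>
__device__ T ShflDown(T val, int delta) {
  return __shfl_down_sync(0xffffffffu, val, delta);
}
#endif

// Assumes blockDim.x is a multiple of kWarpSize and at most kWarpSize^2,
// so the per-warp partials fit in one shared-memory slot per warp.
template <typename T>
__device__ T BlockReduceSum(T val) {
  __shared__ T shm[kWarpSize];  // one partial sum per warp
  const int lane = threadIdx.x % kWarpSize;
  const int warp = threadIdx.x / kWarpSize;

  // Intra-warp tree reduction in registers.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
    val += ShflDown(val, offset);

  if (lane == 0) shm[warp] = val;  // each warp leader publishes its partial
  __syncthreads();

  // The first warp reduces the per-warp partials.
  const int num_warps = blockDim.x / kWarpSize;
  val = (threadIdx.x < num_warps) ? shm[lane] : T(0);
  if (warp == 0)
    for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
      val += ShflDown(val, offset);
  return val;  // total is valid in thread 0
}
```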