Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarks: Revision - Support flexible warmup and non-random data initialization in cublas-benchmark #479

Merged
merged 8 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def add_parser_arguments(self):
self._parser.add_argument(
cp5555 marked this conversation as resolved.
Show resolved Hide resolved
'--num_warmup',
type=int,
default=8,
default=8 * 1000,
cp5555 marked this conversation as resolved.
Show resolved Hide resolved
required=False,
help='The number of warmup step.',
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,17 @@ class CublasFunction {
/**
* @brief Fill the random data into the input
*/
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host);
template <typename T> void fill_data(T *Parameter_0_0_host, T *Parameter_1_0_host, bool random = true);
/**
* @brief Prepare memory and data of the input and output
*/
template <typename T>
void prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0, T **Parameter_0_0_host,
T **Parameter_1_0_host);
T **Parameter_1_0_host, bool random = true);
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {}
virtual void prepare_tensor(bool random = true) {}
/**
* @brief Execute the kernel/function
*/
Expand Down Expand Up @@ -228,47 +228,67 @@ class CublasFunction {
/**
* @brief Fill the random data into the input in float type
*/
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host) {
srand(random_seed);
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
}
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
/**
 * @brief Fill the input host buffers in float type.
 * @param Parameter_0_0_host host buffer of the first input, m_ * k_ * batch_count_ elements
 * @param Parameter_1_0_host host buffer of the second input, k_ * n_ * batch_count_ elements
 * @param random when true, fill with seeded pseudo-random values in [0, 1];
 *               when false, fill with the fixed values 1.0f and 2.0f
 */
template <> void CublasFunction::fill_data(float *Parameter_0_0_host, float *Parameter_1_0_host, bool random) {
    if (random) {
        // Seed so repeated runs produce identical pseudo-random inputs.
        srand(random_seed);
        for (int i = 0; i < m_ * k_ * batch_count_; i++) {
            Parameter_0_0_host[i] = ((float)rand() / (float)(RAND_MAX));
        }
        for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
            Parameter_1_0_host[i] = ((float)rand() / (float)(RAND_MAX));
        }
    } else {
        // Fill the inputs with fixed float values. Note: memset writes a byte
        // pattern, so memset(ptr, 1, bytes) would make every element ~2.4e-38
        // rather than 1.0f; use std::fill to match the half/cuComplex
        // specializations, which fill with 1.0 and 2.0.
        std::fill(Parameter_0_0_host, Parameter_0_0_host + (size_t)m_ * k_ * batch_count_, 1.0f);
        std::fill(Parameter_1_0_host, Parameter_1_0_host + (size_t)k_ * n_ * batch_count_, 2.0f);
    }
}
/**
* @brief Fill the random data into the input in half type
*/
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host) {
srand(random_seed);
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
Parameter_0_0_host[i] = half((float)rand() / (float)(RAND_MAX));
}
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
Parameter_1_0_host[i] = half((float)rand() / (float)(RAND_MAX));
/**
 * @brief Fill the input host buffers in half type.
 * @param Parameter_0_0_host host buffer of the first input, m_ * k_ * batch_count_ elements
 * @param Parameter_1_0_host host buffer of the second input, k_ * n_ * batch_count_ elements
 * @param random when true, fill with seeded pseudo-random values in [0, 1];
 *               when false, fill with the fixed values half(1.0) and half(2.0)
 */
template <> void CublasFunction::fill_data(half *Parameter_0_0_host, half *Parameter_1_0_host, bool random) {
    if (!random) {
        // Deterministic fixed-value initialization of both inputs.
        std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, half(1.0));
        std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, half(2.0));
        return;
    }
    // Seeded pseudo-random initialization so repeated runs are reproducible.
    srand(random_seed);
    const int lhs_count = m_ * k_ * batch_count_;
    for (int idx = 0; idx < lhs_count; ++idx) {
        Parameter_0_0_host[idx] = half((float)rand() / (float)(RAND_MAX));
    }
    const int rhs_count = k_ * n_ * batch_count_;
    for (int idx = 0; idx < rhs_count; ++idx) {
        Parameter_1_0_host[idx] = half((float)rand() / (float)(RAND_MAX));
    }
}
/**
* @brief Fill the random data into the input in cuComplex type
*/
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host) {
srand(random_seed);
for (int i = 0; i < m_ * k_ * batch_count_; i++) {
Parameter_0_0_host[i] =
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
}
for (int i = 0; i < k_ * n_ * batch_count_; ++i) {
Parameter_1_0_host[i] =
make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
/**
 * @brief Fill the input host buffers in cuComplex type.
 * @param Parameter_0_0_host host buffer of the first input, m_ * k_ * batch_count_ elements
 * @param Parameter_1_0_host host buffer of the second input, k_ * n_ * batch_count_ elements
 * @param random when true, fill real/imaginary parts with seeded pseudo-random
 *               values in [0, 1]; when false, fill with fixed (1,1) and (2,2)
 */
template <> void CublasFunction::fill_data(cuComplex *Parameter_0_0_host, cuComplex *Parameter_1_0_host, bool random) {
    if (!random) {
        // Deterministic fixed-value initialization of both inputs.
        std::fill(Parameter_0_0_host, Parameter_0_0_host + m_ * k_ * batch_count_, make_cuComplex(1.0f, 1.0f));
        std::fill(Parameter_1_0_host, Parameter_1_0_host + k_ * n_ * batch_count_, make_cuComplex(2.0f, 2.0f));
        return;
    }
    // Seeded pseudo-random initialization so repeated runs are reproducible.
    srand(random_seed);
    const int lhs_count = m_ * k_ * batch_count_;
    for (int idx = 0; idx < lhs_count; ++idx) {
        Parameter_0_0_host[idx] =
            make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
    }
    const int rhs_count = k_ * n_ * batch_count_;
    for (int idx = 0; idx < rhs_count; ++idx) {
        Parameter_1_0_host[idx] =
            make_cuComplex(((float)rand() / (float)(RAND_MAX)), ((float)rand() / (float)(RAND_MAX)));
    }
}
/**
* @brief Prepare memory and data of the input and output
*/
template <typename T>
void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_0, T **Result_3_0,
T **Parameter_0_0_host, T **Parameter_1_0_host) {
T **Parameter_0_0_host, T **Parameter_1_0_host, bool random) {
int m = this->m_, n = this->n_, k = this->k_, batch_count = this->batch_count_;
// input argument
CUDA_SAFE_CALL(cudaMallocHost((void **)Parameter_0_0_host, sizeof(T) * m * k * batch_count_));
Expand All @@ -278,7 +298,7 @@ void CublasFunction::prepare_tensor_template(T **Parameter_0_0, T **Parameter_1_
CUDA_SAFE_CALL(cudaMalloc((void **)Parameter_1_0, sizeof(T) * n * k * batch_count_));

// fill input values
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host));
fill_data(reinterpret_cast<T *>(*Parameter_0_0_host), reinterpret_cast<T *>(*Parameter_1_0_host), random);

// copy input data from host to device
CUDA_SAFE_CALL(
Expand Down Expand Up @@ -469,13 +489,12 @@ int CublasFunction::check_result(int batch_count, cuComplex *Result_3_0, std::co
*/
void CublasFunction::benchmark() {
// Malloc memory for input and output data
this->prepare_tensor();
bool random_data = this->correctness ? true : false;
cp5555 marked this conversation as resolved.
Show resolved Hide resolved
this->prepare_tensor(random_data);

// Warm up
for (int i_ = 0; i_ < warm_up; i_++) {
for (int j = 0; j < num_in_step; j++) {
this->kernel_entry();
}
this->kernel_entry();
}
CUDA_SAFE_CALL(cudaDeviceSynchronize());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ class SgemmFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
virtual void prepare_tensor(bool random) {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
random);
}
/**
* @brief Check the correctness of function calculation result
Expand Down Expand Up @@ -107,8 +108,9 @@ class CgemmFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
virtual void prepare_tensor(bool random) {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
random);
}
/**
* @brief Check the correctness of function calculation result
Expand Down Expand Up @@ -169,17 +171,17 @@ class GemmExFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
virtual void prepare_tensor(bool random) {
if (this->datatype_.compare("half") == 0) {
CublasFunction::prepare_tensor_template<half>(
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
reinterpret_cast<half **>(&Parameter_1_0_host));
reinterpret_cast<half **>(&Parameter_1_0_host), random);
} else if (this->datatype_.compare("float") == 0) {
CublasFunction::prepare_tensor_template<float>(
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
reinterpret_cast<float **>(&Parameter_1_0_host));
reinterpret_cast<float **>(&Parameter_1_0_host), random);
}
}
/**
Expand Down Expand Up @@ -265,17 +267,17 @@ class GemmStridedBatchedExFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
virtual void prepare_tensor(bool random) {
if (this->datatype_.compare("half") == 0) {
prepare_tensor_template<half>(
reinterpret_cast<half **>(&Parameter_0_0), reinterpret_cast<half **>(&Parameter_1_0),
reinterpret_cast<half **>(&Result_3_0), reinterpret_cast<half **>(&Parameter_0_0_host),
reinterpret_cast<half **>(&Parameter_1_0_host));
reinterpret_cast<half **>(&Parameter_1_0_host), random);
} else if (this->datatype_.compare("float") == 0) {
prepare_tensor_template<float>(
reinterpret_cast<float **>(&Parameter_0_0), reinterpret_cast<float **>(&Parameter_1_0),
reinterpret_cast<float **>(&Result_3_0), reinterpret_cast<float **>(&Parameter_0_0_host),
reinterpret_cast<float **>(&Parameter_1_0_host));
reinterpret_cast<float **>(&Parameter_1_0_host), random);
}
}
/**
Expand Down Expand Up @@ -355,8 +357,9 @@ class SgemmStridedBatchedFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
virtual void prepare_tensor(bool random) {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
random);
}
/**
* @brief Function calculation on CPU side
Expand Down Expand Up @@ -419,8 +422,9 @@ class Cgemm3mStridedBatchedFunction : public CublasFunction {
/**
* @brief Prepare memory and data of the input and output for kernel running
*/
virtual void prepare_tensor() {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host);
virtual void prepare_tensor(bool random) {
prepare_tensor_template(&Parameter_0_0, &Parameter_1_0, &Result_3_0, &Parameter_0_0_host, &Parameter_1_0_host,
random);
}
/**
* @brief Function calculation on CPU side
Expand Down