[MXNET-404] elemwise_add/sub between rsp and rsp on GPU #11179
Conversation
Benchmark script:

import mxnet as mx
import sys
import os
import scipy
import numpy as np
from mxnet.test_utils import rand_ndarray, assert_almost_equal
import time


def measure_cost(repeat, a, b, out=None):
    # start bench
    start = time.time()
    results = []
    for i in range(repeat):
        results.append(mx.nd.elemwise_add(a, b, out=out))
    for result in results:
        result.wait_to_read()
    end = time.time()
    diff = end - start
    return diff / repeat


def measure_fallback(repeat, a):
    # start bench
    start = time.time()
    results = []
    for i in range(repeat):
        results.append(a.tostype('default'))
    for result in results:
        result.wait_to_read()
    end = time.time()
    diff = end - start
    return diff / repeat


def main():
    shape = (1000000, 512)
    context = mx.gpu(0)
    # context = mx.cpu()
    # Case 1: rsp + rsp, both operands sparse
    for lhs_density in [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_lhs = rand_ndarray(shape, stype='row_sparse', density=lhs_density).as_in_context(context)
        mx_lhs_dns = mx_lhs.tostype('default')
        for rhs_density in [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
            mx_rhs = rand_ndarray(shape=shape, stype='row_sparse', density=rhs_density).as_in_context(context)
            mx_rhs_dns = mx_rhs.tostype('default')
            # warmup and correctness check against numpy
            sparse_cost = 0.0
            dns_cost = 0.0
            np_lhs = mx_lhs_dns.asnumpy()
            check = mx.nd.elemwise_add(mx_lhs, mx_rhs)
            np_lhs = np_lhs + mx_rhs.asnumpy()
            assert_almost_equal(check.asnumpy(), np_lhs, atol=1e-5, rtol=1e-4)
            mx.nd.waitall()
            for i in range(100):
                sparse_cost += measure_cost(1, mx_lhs, mx_rhs)
                dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns)
            # lhs density %, rhs density %, speedup of the sparse kernel over dense
            print("%.2f %% %.2f %%" % (lhs_density*100, rhs_density*100), dns_cost / sparse_cost)
    # Case 2: in-place, result written back into lhs (lhs is a fully dense row_sparse array)
    for rhs_density in [1.000, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_lhs_dns = mx.nd.ones(shape, ctx=context)
        mx_lhs = mx_lhs_dns.tostype('row_sparse')
        mx_rhs = rand_ndarray(shape=shape, stype='row_sparse', density=rhs_density).as_in_context(context)
        mx_rhs_dns = mx_rhs.tostype('default')
        # warmup and correctness check against numpy
        sparse_cost = 0.0
        dns_cost = 0.0
        np_lhs = mx_lhs_dns.asnumpy()
        mx.nd.elemwise_add(mx_lhs, mx_rhs, out=mx_lhs)
        np_lhs = np_lhs + mx_rhs.asnumpy()
        assert_almost_equal(mx_lhs.asnumpy(), np_lhs, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for i in range(100):
            sparse_cost += measure_cost(1, mx_lhs, mx_rhs, out=mx_lhs)
            dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns, out=mx_lhs_dns)
        print("%.2f %% %.2f %%" % (1.00000*100, rhs_density*100), dns_cost / sparse_cost)
    # Case 3: in-place, result written back into rhs (rhs is a fully dense row_sparse array)
    for lhs_density in [1.000, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_rhs_dns = mx.nd.ones(shape, ctx=context)
        mx_rhs = mx_rhs_dns.tostype('row_sparse')
        mx_lhs = rand_ndarray(shape=shape, stype='row_sparse', density=lhs_density).as_in_context(context)
        mx_lhs_dns = mx_lhs.tostype('default')
        # warmup and correctness check against numpy
        sparse_cost = 0.0
        dns_cost = 0.0
        np_rhs = mx_rhs_dns.asnumpy()
        mx.nd.elemwise_add(mx_lhs, mx_rhs, out=mx_rhs)
        np_rhs = np_rhs + mx_lhs.asnumpy()
        assert_almost_equal(mx_rhs.asnumpy(), np_rhs, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for i in range(100):
            sparse_cost += measure_cost(1, mx_lhs, mx_rhs, out=mx_rhs)
            dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns, out=mx_rhs_dns)
        # printed as lhs density %, rhs density % (rhs is 100% dense here)
        print("%.2f %% %.2f %%" % (lhs_density*100, 1.00000*100), dns_cost / sparse_cost)


if __name__ == "__main__":
    main()
Benchmark results:
@eric-haibin-lin Please give a review when you have time, thanks!
@@ -156,7 +156,7 @@ class NDArray {
   }

   /* \brief Check whether the two arrays are the same array */
-  inline bool IsSame(const NDArray& other) {
+  inline bool IsSame(const NDArray& other) const {
@piiswrong I made the change here so that I can also call this function when I have a const NDArray object.
};

template<typename OP>
void ElemwiseBinaryOp::RspRspOp(mshadow::Stream<gpu> *s,
Do we have a unit test for the in-place write case?
The in-place case shares the same code path as the in-place case between dns and rsp, which already has a unit test.
BTW, correctness is also double-checked in the benchmark script during the warmup.
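For reference, a minimal sketch of that warmup check, distilled from the benchmark above: it verifies the GPU rsp + rsp result against a dense NumPy reference. The shape and densities here are just illustrative, and it assumes an MXNet build with CUDA support.

```python
import mxnet as mx
from mxnet.test_utils import rand_ndarray, assert_almost_equal

# Illustrative shape/densities; the benchmark above uses (1000000, 512).
shape, context = (1000, 64), mx.gpu(0)
lhs = rand_ndarray(shape, stype='row_sparse', density=0.01).as_in_context(context)
rhs = rand_ndarray(shape, stype='row_sparse', density=0.005).as_in_context(context)

out = mx.nd.elemwise_add(lhs, rhs)          # rsp + rsp on GPU
assert out.stype == 'row_sparse'            # output stays row_sparse
expected = lhs.asnumpy() + rhs.asnumpy()    # dense NumPy reference
assert_almost_equal(out.asnumpy(), expected, atol=1e-5, rtol=1e-4)
```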
  CHECK(!scatter) << "scatter is not supported in RspRspOp on GPU yet...";
  CHECK(lhs.storage_type() == kRowSparseStorage && rhs.storage_type() == kRowSparseStorage);
  CHECK(output.storage_type() == kRowSparseStorage);
Does it support kAddTo? If not, should we add a CHECK_NE(req, kAddTo)?
    ElemwiseBinaryOp::DnsRspDnsOp<gpu, OP>(s, attrs, ctx, dns, rsp, req, output, reverse);
    return;
  }
  CHECK(req == kWriteTo) << "Should be kWriteTo but got " << req;
If this function assumes req is never kNullOp, it would be better to document that in the header.
         lhs.data().FlatTo1D<gpu, DType>(), s);
    Copy(output.aux_data(kIdx).FlatTo1D<gpu, IType>(),
         lhs.aux_data(kIdx).FlatTo1D<gpu, IType>(), s);
  }
What about kWriteInplace in all these branches? Should we add a check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extra checks and tests added.
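For the in-place path, a unit test along these lines would exercise out=lhs on row_sparse inputs. This is a hedged sketch mirroring the benchmark's in-place warmup, not necessarily the exact test added in this PR; lhs is made fully dense (as row_sparse) so the in-place write is well defined.

```python
import mxnet as mx
from mxnet.test_utils import rand_ndarray, assert_almost_equal

# Sketch of an in-place rsp += rsp check on GPU; shape/density are illustrative.
shape, context = (1000, 64), mx.gpu(0)
lhs = mx.nd.ones(shape, ctx=context).tostype('row_sparse')
rhs = rand_ndarray(shape, stype='row_sparse', density=0.01).as_in_context(context)

expected = lhs.asnumpy() + rhs.asnumpy()   # dense reference before overwriting lhs
mx.nd.elemwise_add(lhs, rhs, out=lhs)      # out=lhs exercises the in-place write path
assert lhs.stype == 'row_sparse'
assert_almost_equal(lhs.asnumpy(), expected, atol=1e-5, rtol=1e-4)
```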
Force-pushed from 16c160b to a25a2cd
Force-pushed from a25a2cd to d7e67a8
@eric-haibin-lin Should be good for merge.
* Support for elemwise_add/sub between rsp and rsp on GPU
* add extra test coverage for inplace cases
Description
As title: support for elemwise_add/sub between two row_sparse (rsp) NDArrays on GPU.
Checklist
Essentials
Changes
Comments
For performance benchmark results, please see the comments above.