Skip to content

Commit

Permalink
[benchmarks] flatten/unflatten benchmarks (#919)
Browse files Browse the repository at this point in the history
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
  • Loading branch information
stas00 and jeffra authored Apr 7, 2021
1 parent c79184e commit a128f34
Show file tree
Hide file tree
Showing 2 changed files with 277 additions and 0 deletions.
134 changes: 134 additions & 0 deletions tests/benchmarks/flatten_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python
# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
#
# usage:
# ./flatten_bench.py -t
# ./flatten_bench.py -c
# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof

import argparse

import gc

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from deepspeed.ops.op_builder import UtilsBuilder

from apex_C import flatten as flatten_apex

# Build/load the DeepSpeed C++ utils op (provides flatten/unflatten kernels).
util_ops = UtilsBuilder().load()
flatten = util_ops.flatten
unflatten = util_ops.unflatten

torch.manual_seed(0)
# emulate a small typical model's weights
x = [
    torch.rand((512, 512)).cuda(),
    torch.rand((512, 1024)).cuda(),
    torch.rand((512, 30000)).cuda(),
]
# NOTE: this repeats references to the same three tensors; that is fine for a
# flatten benchmark (read-only) and keeps GPU allocation small.
t = x * 30

# warm up and check that every implementation produces the same output.
# (messages previously claimed "both produce the same tensor", which is
# backwards -- the message is only shown when they differ)
flat_py = _flatten_dense_tensors(t)
flat_cpp = flatten(t)
flat_apex = flatten_apex(t)
assert torch.eq(flat_py, flat_cpp).all(), "cpp flatten disagrees with python reference"
assert torch.eq(flat_py, flat_apex).all(), "apex flatten disagrees with python reference"

# iterations per benchmarked function
TIMES = 1000


# the programs being tested
def py():
    """Benchmark the pure-python torch._utils flatten, TIMES iterations."""
    for _ in range(TIMES):
        result = _flatten_dense_tensors(t)


def cpp():
    """Benchmark the DeepSpeed C++ flatten kernel, TIMES iterations."""
    for _ in range(TIMES):
        result = flatten(t)


def apex():
    """Benchmark the apex C++ flatten kernel, TIMES iterations."""
    for _ in range(TIMES):
        result = flatten_apex(t)


#### cProfile ####

import cProfile


def cprofileme():
    """Profile each flatten implementation with cProfile.

    CUDA cache is flushed between runs so one implementation's allocations
    don't skew the next one's numbers.
    """
    print("--------------- cProfile -----------------")
    for label, stmt in (("py", "py()"), ("cpp", "cpp()"), ("apex", "apex()")):
        print(label)
        # cProfile.run executes `stmt` in __main__'s namespace, where the
        # benchmark functions live when this script is run directly.
        cProfile.run(stmt, sort=-1)
        gc.collect()
        torch.cuda.empty_cache()


#### timeit ####

import timeit


def timeme():
    """Time one full run of each flatten implementation with timeit."""
    print("--------------- timeit -----------------")
    # labels are padded so the printed numbers line up column-wise
    for label, stmt in (("py ", "py()"), ("cpp ", "cpp()"), ("apex", "apex()")):
        elapsed = timeit.Timer(stmt, globals=globals()).timeit(number=1)
        print(f"{label}={elapsed}")
        gc.collect()
        torch.cuda.empty_cache()


#### line_profiler ####
# this one requires a special way to be called
# pip install line_profiler
# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof


def line_profileme():
    """Run each implementation under line_profiler.

    Relies on the `profile` builtin that kernprof injects; only works when
    invoked as `kernprof -l flatten_bench.py -l`.
    """
    print("--------------- line_profier -----------------")
    for label, fn in (("py", py), ("cpp", cpp), ("apex", apex)):
        print(label)
        profile(fn)()
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", action='store_true', help="run under line_profiler (invoke via kernprof -l)")
    parser.add_argument("-c", action='store_true', help="profile with cProfile")
    parser.add_argument("-t", action='store_true', help="time with timeit")
    args = parser.parse_args()
    if args.l:
        line_profileme()
    elif args.c:
        cprofileme()
    elif args.t:
        timeme()
    else:
        # previously the script silently did nothing when no flag was given
        parser.print_help()
143 changes: 143 additions & 0 deletions tests/benchmarks/unflatten_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python

# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
#
# usage:
# ./unflatten_bench.py -t
# ./unflatten_bench.py -c
# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof

import argparse
import gc
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from deepspeed.ops.op_builder import UtilsBuilder

from apex_C import flatten as flatten_apex
from apex_C import unflatten as unflatten_apex

# Build/load the DeepSpeed C++ utils op (provides flatten/unflatten kernels).
util_ops = UtilsBuilder().load()
flatten = util_ops.flatten
unflatten = util_ops.unflatten

torch.manual_seed(0)
# emulate a small typical model's weights
x = [
    torch.rand((512, 512)).cuda(),
    torch.rand((512, 1024)).cuda(),
    torch.rand((512, 30000)).cuda(),
]
# repeats references to the same three tensors; fine for read-only benchmarking
unflat_t = x * 30

# warm up and check that all flatten implementations agree
flat_py = _flatten_dense_tensors(unflat_t)
flat_cpp = flatten(unflat_t)
flat_apex = flatten_apex(unflat_t)
assert torch.eq(flat_py, flat_cpp).all(), "cpp flatten disagrees with python reference"
assert torch.eq(flat_py, flat_apex).all(), "apex flatten disagrees with python reference"

flat_t = flat_py
# validate that each unflatten implementation round-trips the originals.
# BUG FIX: the cpp/apex checks previously called _unflatten_dense_tensors
# (the python reference) for all three, so the cpp and apex kernels being
# benchmarked were never actually validated.
unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
for orig, restored in zip(unflat_t, unflat_py):
    assert torch.eq(orig, restored).all(), "python unflatten failed round-trip"
unflat_cpp = unflatten(flat_cpp, unflat_t)
for orig, restored in zip(unflat_t, unflat_cpp):
    assert torch.eq(orig, restored).all(), "cpp unflatten failed round-trip"
unflat_apex = unflatten_apex(flat_apex, unflat_t)
for orig, restored in zip(unflat_t, unflat_apex):
    assert torch.eq(orig, restored).all(), "apex unflatten failed round-trip"


# the programs being tested
def py():
    """Benchmark the pure-python torch._utils unflatten, 1000 iterations."""
    for _ in range(1000):
        result = _unflatten_dense_tensors(flat_t, unflat_t)


def cpp():
    """Benchmark the DeepSpeed C++ unflatten kernel, 1000 iterations."""
    for _ in range(1000):
        result = unflatten(flat_t, unflat_t)


def apex():
    """Benchmark the apex C++ unflatten kernel, 1000 iterations."""
    for _ in range(1000):
        result = unflatten_apex(flat_t, unflat_t)


#### cProfile ####

import cProfile


def cprofileme():
    """Profile each unflatten implementation with cProfile.

    CUDA cache is flushed between runs so one implementation's allocations
    don't skew the next one's numbers.
    """
    print("--------------- cProfile -----------------")
    for label, stmt in (("py", "py()"), ("cpp", "cpp()"), ("apex", "apex()")):
        print(label)
        # cProfile.run executes `stmt` in __main__'s namespace, where the
        # benchmark functions live when this script is run directly.
        cProfile.run(stmt, sort=-1)
        gc.collect()
        torch.cuda.empty_cache()


#### timeit ####

import timeit


def timeme():
    """Time one full run of each unflatten implementation with timeit."""
    print("--------------- timeit -----------------")
    # labels are padded so the printed numbers line up column-wise
    for label, stmt in (("py ", "py()"), ("cpp ", "cpp()"), ("apex", "apex()")):
        elapsed = timeit.Timer(stmt, globals=globals()).timeit(number=1)
        print(f"{label}={elapsed}")
        gc.collect()
        torch.cuda.empty_cache()


#### line_profiler ####
# this one requires a special way to be called
# pip install line_profiler
# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof


def line_profileme():
    """Run each implementation under line_profiler.

    Relies on the `profile` builtin that kernprof injects; only works when
    invoked as `kernprof -l unflatten_bench.py -l`.
    """
    print("--------------- line_profier -----------------")
    for label, fn in (("py", py), ("cpp", cpp), ("apex", apex)):
        print(label)
        profile(fn)()
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", action='store_true', help="run under line_profiler (invoke via kernprof -l)")
    parser.add_argument("-c", action='store_true', help="profile with cProfile")
    parser.add_argument("-t", action='store_true', help="time with timeit")
    args = parser.parse_args()
    if args.l:
        line_profileme()
    elif args.c:
        cprofileme()
    elif args.t:
        timeme()
    else:
        # previously the script silently did nothing when no flag was given
        parser.print_help()

0 comments on commit a128f34

Please sign in to comment.