Skip to content

Commit

Permalink
launcher save pid + require manual triton install for sparse-attn (#1727)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffra authored Jan 27, 2022
1 parent df724e7 commit 171316f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
22 changes: 22 additions & 0 deletions deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
PDSH_MAX_FAN_OUT = 1024
# Directory in which the optional `--save_pid` pid file (<main-pid>.deepspeed) is created.
PID_FILE_BASEPATH = "/tmp"


def parse_args(args=None):
Expand Down Expand Up @@ -126,6 +127,13 @@ def parse_args(args=None):
help="Force multi-node launcher mode, helps in cases where user "
"wants to launch on single remote node.")

# Opt-in flag: when set, the launcher writes a pid file while the job runs.
# The path in the help text must match what main() actually writes, i.e.
# PID_FILE_BASEPATH/<main-pid>.deepspeed (the suffix is ".deepspeed", not ".ds").
parser.add_argument(
    "--save_pid",
    action="store_true",
    help="Save file containing launcher process id (pid) at /tmp/<main-pid>.deepspeed, "
    "where <main-pid> is the pid of the first process that invoked `deepspeed`. "
    "Useful when launching deepspeed processes programmatically.")

parser.add_argument(
"--autotuning",
default="",
Expand Down Expand Up @@ -428,8 +436,22 @@ def main(args=None):

logger.info(f"cmd = {' '.join(cmd)}")
result = subprocess.Popen(cmd, env=env)

# When --save_pid is given, record the launcher subprocess pid in a file named
# after this (parent) process so external tooling can find the child process.
pid_file = None
if args.save_pid:
    pid_file = os.path.join(PID_FILE_BASEPATH, f"{os.getpid()}.deepspeed")
    with open(pid_file, 'w') as fd:
        fd.write(f"{result.pid}")

try:
    result.wait()
finally:
    # Clean up the saved pid file even when the wait is interrupted
    # (e.g. KeyboardInterrupt), so no stale pid file is left behind.
    if pid_file is not None and os.path.isfile(pid_file):
        os.remove(pid_file)

# In case of failure must propagate the error-condition back to the caller (usually shell). The
# actual error and traceback should have been printed in the subprocess, so in order to avoid
# unnecessary noise we just quietly exit here with the same code as the subprocess
Expand Down
17 changes: 17 additions & 0 deletions op_builder/sparse_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import warnings
from .builder import OpBuilder
from packaging import version as pkg_version


class SparseAttnBuilder(OpBuilder):
Expand Down Expand Up @@ -52,4 +53,20 @@ def is_compatible(self, verbose=True):
f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
)

# Sparse attention needs a manually installed triton, pinned to exactly 1.0.0.
# If triton is missing or has any other version, a warning is logged and
# is_compatible() returns False.
try:
import triton
except ImportError:
# auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/microsoft/DeepSpeed/issues/1710
self.warning(
f"please install triton==1.0.0 if you want to use sparse attention")
# Only a warning is logged, but returning False reports this op as not compatible.
return False

# Compare parsed version objects rather than raw strings.
installed_triton = pkg_version.parse(triton.__version__)
if installed_triton != pkg_version.parse("1.0.0"):
self.warning(
f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible"
)
# NOTE(review): the message is logged at warning level, yet any version other than
# 1.0.0 is rejected outright here; the op is reported as not compatible.
return False

This comment has been minimized.

Copy link
@stadlerb

stadlerb Jan 12, 2023

Why return False here as the log message above is only a warning? It would be consistent to log an error and return False, or to log a warning and continue without a return. I found it very confusing that the build cancelled because of this.


return super().is_compatible(verbose) and torch_compatible and cuda_compatible

0 comments on commit 171316f

Please sign in to comment.