-
Notifications
You must be signed in to change notification settings - Fork 132
Add apex install to pytorch wheel #2280
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5ede7ab
06353dc
3199657
a6444f5
ab8287b
b104c25
f2505c6
5c46d00
577481b
3b02d5c
0086777
df72923
1e9b724
3de1482
8e4aafa
914fd1b
49b0c0a
10c3847
f62006d
5711c36
ee403d2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -87,6 +87,7 @@ | |
| "zipp", | ||
| # ---- | ||
| "Pillow", | ||
| "apex", | ||
| "certifi", | ||
| "charset_normalizer", | ||
| "cmake", | ||
|
|
||
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,12 +28,14 @@ | |
| # in the former. | ||
| python pytorch_torch_repo.py checkout | ||
| python pytorch_audio_repo.py checkout | ||
| python pytorch_apex_repo.py checkout | ||
| python pytorch_vision_repo.py checkout | ||
| python pytorch_triton_repo.py checkout | ||
|
|
||
| # On Windows, using shorter paths to avoid compile command length limits: | ||
| python pytorch_torch_repo.py checkout --checkout-dir C:/b/pytorch | ||
| python pytorch_audio_repo.py checkout --checkout-dir C:/b/audio | ||
| python pytorch_apex_repo.py checkout --checkout-dir C:/b/apex | ||
| python pytorch_vision_repo.py checkout --checkout-dir C:/b/vision | ||
| ``` | ||
|
|
||
|
|
@@ -337,6 +339,7 @@ def do_build(args: argparse.Namespace): | |
| pytorch_dir: Path | None = args.pytorch_dir | ||
| pytorch_audio_dir: Path | None = args.pytorch_audio_dir | ||
| pytorch_vision_dir: Path | None = args.pytorch_vision_dir | ||
| apex_dir: Path | None = args.apex_dir | ||
|
|
||
| rocm_sdk_version = get_rocm_sdk_version() | ||
| cmake_prefix = get_rocm_path("cmake") | ||
|
|
@@ -495,6 +498,13 @@ def do_build(args: argparse.Namespace): | |
|
|
||
| print("--- Builds all completed") | ||
|
|
||
| # Build apex. | ||
| if args.build_apex or (args.build_apex is None and apex_dir): | ||
| assert apex_dir, "Must specify --apex-dir if --build-apex" | ||
| do_build_apex(args, apex_dir, dict(env)) | ||
| else: | ||
| print("--- Not build apex (no --apex-dir)") | ||
|
|
||
| if args.use_ccache: | ||
| ccache_stats_output = capture( | ||
| ["ccache", "--show-stats"], cwd=tempfile.gettempdir() | ||
|
|
@@ -881,6 +891,36 @@ def do_build_pytorch_vision( | |
| copy_to_output(args, built_wheel) | ||
|
|
||
|
|
||
| def do_build_apex(args: argparse.Namespace, apex_dir: Path, env: dict[str, str]): | ||
| # Compute version. | ||
| build_version = (apex_dir / "version.txt").read_text().strip() | ||
| build_version += args.version_suffix | ||
| print(f" Default apex BUILD_VERSION: {build_version}") | ||
| env["BUILD_VERSION"] = build_version | ||
| env["BUILD_NUMBER"] = args.pytorch_build_number | ||
|
|
||
| remove_dir_if_exists(apex_dir / "dist") | ||
| if args.clean: | ||
| remove_dir_if_exists(apex_dir / "build") | ||
|
|
||
| exec( | ||
| [ | ||
| sys.executable, | ||
| "-m", | ||
| "build", | ||
| "--wheel", | ||
| "--no-isolation", | ||
| "-C--build-option=--cpp_ext", | ||
| "-C--build-option=--cuda_ext", | ||
| ], | ||
| cwd=apex_dir, | ||
| env=env, | ||
| ) | ||
|
Comment on lines
+906
to
+918
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I pulled down a whl from https://rocm.nightlies.amd.com/v2-staging/gfx94X-dcgpu/apex/ (^), I see a bunch of
(^)(again do NOT test in prod - you should test with https://rocm.devreleases.amd.com/) |
||
| built_wheel = find_built_wheel(apex_dir / "dist", "apex") | ||
| print(f"Found built wheel: {built_wheel}") | ||
| copy_to_output(args, built_wheel) | ||
|
|
||
|
|
||
| def main(argv: list[str]): | ||
| p = argparse.ArgumentParser(prog="build_prod_wheels.py") | ||
|
|
||
|
|
@@ -952,6 +992,12 @@ def add_common(p: argparse.ArgumentParser): | |
| type=Path, | ||
| help="pinned triton directory", | ||
| ) | ||
| build_p.add_argument( | ||
| "--apex-dir", | ||
| default=directory_if_exists(script_dir / "apex"), | ||
| type=Path, | ||
| help="apex source directory", | ||
| ) | ||
| build_p.add_argument( | ||
| "--pytorch-rocm-arch", | ||
| help="gfx arch to build pytorch with (defaults to rocm-sdk targets)", | ||
|
|
@@ -977,6 +1023,12 @@ def add_common(p: argparse.ArgumentParser): | |
| default=None, | ||
| help="Enable building of torch vision (requires --pytorch-vision-dir)", | ||
| ) | ||
| build_p.add_argument( | ||
| "--build-apex", | ||
| action=argparse.BooleanOptionalAction, | ||
| default=None, | ||
| help="Enable building of apex (requires --apex-dir)", | ||
| ) | ||
| build_p.add_argument( | ||
| "--enable-pytorch-flash-attention-windows", | ||
| action=argparse.BooleanOptionalAction, | ||
|
|
||
amd-sriram marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,160 @@ | ||
| #!/usr/bin/env python | ||
| """Checks out Rocm Apex. | ||
|
|
||
| There is nothing that this script does which you couldn't do by hand, but because of | ||
| the following, getting PyTorch sources ready to build with ToT TheRock built SDKs | ||
| consists of multiple steps: | ||
|
|
||
| * Sources must be pre-processed with HIPIFY, creating dirty git trees that are hard | ||
| to develop on further. | ||
| * Both the ROCM SDK and PyTorch are moving targets that are eventually consistent. | ||
| We maintain patches for recent PyTorch revisions to adapt to packaging and library | ||
| compatibility differences until all releases are done and available. | ||
|
|
||
| Primary usage: | ||
|
|
||
| ./pytorch_apex_repo.py checkout | ||
|
|
||
| The checkout process combines the following activities: | ||
|
|
||
| * Clones the pytorch repository into `THIS_MAIN_REPO_NAME` with a requested `--repo-hashtag` | ||
| tag (default to latest release). | ||
| * Configures PyTorch submodules to be ignored for any local changes (so that | ||
| the result is suitable for development with local patches). | ||
| * Applies "base" patches to the pytorch repo and any submodules (by using | ||
| `git am` with patches from `patches/pytorch_ref_to_patches_dir_name(<repo-hashtag>)/<repo-name>/base`). | ||
| * Runs `hipify` to prepare sources for AMD GPU and commits the result to the | ||
| main repo and any modified submodules. | ||
| * Applies "hipified" patches to the pytorch repo and any submodules (by using | ||
| `git am` with patches from `patches/<repo-hashtag>/<repo-name>/hipified`). | ||
| * Records some tag information for subsequent activities. | ||
| """ | ||
| import argparse | ||
| from pathlib import Path | ||
| import sys | ||
|
|
||
| import repo_management | ||
|
|
||
| THIS_MAIN_REPO_NAME = "apex" | ||
| THIS_DIR = Path(__file__).resolve().parent | ||
|
|
||
| DEFAULT_ORIGIN = "https://github.com/ROCm/apex.git" | ||
| DEFAULT_HASHTAG = "master" | ||
| DEFAULT_PATCHES_DIR = THIS_DIR / "patches" / THIS_MAIN_REPO_NAME | ||
| DEFAULT_PATCHSET = None | ||
|
|
||
|
|
||
| def main(cl_args: list[str]): | ||
| def add_common(command_parser: argparse.ArgumentParser): | ||
| command_parser.add_argument( | ||
| "--checkout-dir", | ||
| type=Path, | ||
| default=THIS_DIR / THIS_MAIN_REPO_NAME, | ||
| help=f"Directory path where the git repo is cloned into. Default is {THIS_DIR / THIS_MAIN_REPO_NAME}", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--gitrepo-origin", | ||
| type=str, | ||
| default=None, | ||
| help=f"Git repository url. Defaults to the origin in torch/related_commits (see --torch-dir), or '{DEFAULT_ORIGIN}'", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--patch-dir", | ||
| type=Path, | ||
| default=DEFAULT_PATCHES_DIR, | ||
| help="Git repository patch path", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--repo-name", | ||
| type=Path, | ||
| default=THIS_MAIN_REPO_NAME, | ||
| help="Subdirectory name in which to checkout repo", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--repo-hashtag", | ||
| type=str, | ||
| default=None, | ||
| help=f"Git repository ref/tag to checkout. Defaults to the ref in torch/related_commits (see --torch-dir), or '{DEFAULT_HASHTAG}'", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--patchset", | ||
| default=None, | ||
| help=f"Patch dir subdirectory. Defaults to rocm-custom if torch/related_commits exists (see --torch-dir), or '{DEFAULT_PATCHSET}'", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--require-related-commit", | ||
| action=argparse.BooleanOptionalAction, | ||
| help="Require that a related commit was found from --torch-dir", | ||
| ) | ||
| command_parser.add_argument( | ||
| "--torch-dir", | ||
| type=Path, | ||
| default=THIS_DIR / "pytorch", | ||
| help="Directory of the torch checkout, for loading the related_commits file that can populate alternate default values for --gitrepo-origin, --repo-hashtag, and --patchset. If missing then fallback/upstream defaults will be used", | ||
| ) | ||
|
|
||
| p = argparse.ArgumentParser("pytorch_apex_repo.py") | ||
| sub_p = p.add_subparsers(required=True) | ||
| checkout_p = sub_p.add_parser("checkout", help="Clone Apex locally and checkout") | ||
| add_common(checkout_p) | ||
| checkout_p.add_argument("--depth", type=int, help="Fetch depth") | ||
| checkout_p.add_argument("--jobs", type=int, help="Number of fetch jobs") | ||
| checkout_p.add_argument( | ||
| "--hipify", | ||
| action=argparse.BooleanOptionalAction, | ||
| default=True, | ||
| help="Run hipify", | ||
| ) | ||
|
Comment on lines
+102
to
+107
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of curiousity, why does https://github.com/ROCm/apex require running HIPIFY here? As that repository is forked from https://github.com/NVIDIA/apex, could the sources be run through HIPIFY at rest? (I think running HIPIFY after checkout for consistency with how other repositories are set up is fine, but I do want to confirm first) |
||
| checkout_p.add_argument( | ||
| "--patch", | ||
| action=argparse.BooleanOptionalAction, | ||
| default=True, | ||
| help="Apply patches for the repo-hashtag", | ||
| ) | ||
| checkout_p.set_defaults(func=repo_management.do_checkout) | ||
|
|
||
| hipify_p = sub_p.add_parser("hipify", help="Run HIPIFY on the project") | ||
| add_common(hipify_p) | ||
| hipify_p.set_defaults(func=repo_management.do_hipify) | ||
|
|
||
| save_patches_p = sub_p.add_parser( | ||
| "save-patches", help="Save local commits as patch files for later application" | ||
| ) | ||
| add_common(save_patches_p) | ||
| save_patches_p.set_defaults(func=repo_management.do_save_patches) | ||
|
|
||
| args = p.parse_args(cl_args) | ||
|
|
||
| # Set default values based on the pin file in the pytorch repo. | ||
| ( | ||
| default_git_origin, | ||
| default_git_hashtag, | ||
| default_patchset, | ||
| has_related_commit, | ||
| ) = repo_management.read_pytorch_rocm_pins( | ||
| args.torch_dir, | ||
| os="centos", # Read pins for "centos" on Linux and Windows | ||
| project="apex", | ||
| default_origin=DEFAULT_ORIGIN, | ||
| default_hashtag=DEFAULT_HASHTAG, | ||
| default_patchset=DEFAULT_PATCHSET, | ||
| ) | ||
|
|
||
| if args.require_related_commit and not has_related_commit: | ||
| raise ValueError( | ||
| f"Could not find apex in '{args.torch_dir}/related_commits' (did you mean to set a different --torch-dir?)" | ||
| ) | ||
|
|
||
| # Priority order: | ||
| # 1. Explicitly set values | ||
| # 2. Values loaded from the pin in the torch repo | ||
| # 3. Fallback default values | ||
| args.gitrepo_origin = args.gitrepo_origin or default_git_origin | ||
| args.repo_hashtag = args.repo_hashtag or default_git_hashtag | ||
| args.patchset = args.patchset or default_patchset | ||
|
|
||
| args.func(args) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main(sys.argv[1:]) | ||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should start with just the changes in
external-builds/pytorch/to allow local builds of apex and then in a second PR include the other changes to get the automation building the wheels. That may have also caught the "don't test in prod" issues and allowed for earlier feedback on the approach.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottTodd Which file(s) should I include in the second PR? Do you mean .github/workflows/build_portable_linux_pytorch_wheels.yml?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
1st PR: the changes to
external-builds/pytorch/2nd PR: all other changes (
.github/workflows/,build_tools/github_actions/)The changes to
build_tools/third_party/s3_management/manage.pycould go in either PR