Update curriculum-learning.md #3031

Merged · 10 commits · Apr 7, 2023

Changes from 2 commits
2 changes: 1 addition & 1 deletion docs/_tutorials/curriculum-learning.md
@@ -130,7 +130,7 @@ In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that

### 2.3 Token-based training termination

-Because curriculum learning changes length of each sequence/sample during training, it is very hard/impossible to use number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
+Because curriculum learning changes the length of each sequence/sample during training, it is very hard/impossible to use a number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.

### 2.4 Token-based LR decay

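The changed paragraph above describes token-based termination. As a rough illustration of the recommendation (only the flag names come from the tutorial; the token budget, iteration count, and launcher are invented for the example):

```python
# Illustrative arithmetic for the tutorial's recommendation above.
# Only the flag names come from the tutorial; the numbers are made up.
desired_train_tokens = 300_000_000_000  # exact token budget that should end training
baseline_train_iters = 500_000          # step count used for the fixed-length baseline

launch_args = [
    # Padded upper bound (3X the baseline) that training should never actually reach:
    "--train-iters", str(3 * baseline_train_iters),
    # The real termination criterion under curriculum learning:
    "--train-tokens", str(desired_train_tokens),
]
print(" ".join(launch_args))
```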
44 changes: 22 additions & 22 deletions setup.py
@@ -4,10 +4,10 @@
DeepSpeed library

To build wheel on Windows:
-1. Install pytorch, such as pytorch 1.12 + cuda 11.6
-2. Install visual cpp build tool
-3. Include cuda toolkit
-4. Launch cmd console with Administrator privilege for creating required symlink folders
+1. Install pytorch, such as pytorch 1.12 + cuda 11.6.
+2. Install visual cpp build tool.
+3. Include cuda toolkit.
+4. Launch cmd console with Administrator privilege for creating required symlink folders.

Create a new wheel via the following command:
build_win.bat
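Before running `build_win.bat`, a quick preflight check along these lines can confirm step 1; this snippet is illustrative and not part of the repository:

```python
# Preflight check for the Windows wheel-build prerequisites listed above.
import torch

print(torch.__version__)          # e.g. "1.12.1+cu116" -- pytorch 1.12 + cuda 11.6
print(torch.version.cuda)         # should match the installed CUDA toolkit
print(torch.cuda.is_available())  # True once the toolkit and driver are set up
```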
@@ -34,7 +34,7 @@
from op_builder.all_ops import ALL_OPS
from op_builder.builder import installed_cuda_version

-# fetch rocm state
+# Fetch rocm state.
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
rocm_version = OpBuilder.installed_rocm_version()

@@ -66,12 +66,12 @@ def fetch_requirements(path):
    'sd': fetch_requirements('requirements/requirements-sd.txt')
}

-# Add specific cupy version to both onebit extension variants
+# Add specific cupy version to both onebit extension variants.
if torch_available and torch.cuda.is_available():
    cupy = None
    if is_rocm_pytorch:
        rocm_major, rocm_minor = rocm_version
-        # XXX cupy support for rocm 5 is not available yet
+        # XXX cupy support for rocm 5 is not available yet.
        if rocm_major <= 4:
            cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
    else:
@@ -80,7 +80,7 @@ def fetch_requirements(path):
        extras_require['1bit'].append(cupy)
        extras_require['1bit_mpi'].append(cupy)

-# Make an [all] extra that installs all needed dependencies
+# Make an [all] extra that installs all needed dependencies.
all_extras = set()
for extra in extras_require.items():
    for req in extra[1]:
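The diff view cuts this loop off. For context, a minimal self-contained version of the `[all]`-extra pattern; the loop body, the final assignment, and the requirement strings are assumptions based on the visible lines, not necessarily the repository's exact code:

```python
# Simplified, hypothetical stand-in for the extras dict built earlier in setup.py.
extras_require = {
    '1bit': ['cupy-cuda11x'],  # illustrative pin, not DeepSpeed's real one
    'dev': ['pytest'],
}

# Collect every requirement from every extra into one deduplicated [all] extra.
all_extras = set()
for extra in extras_require.items():
    for req in extra[1]:
        all_extras.add(req)
extras_require['all'] = sorted(all_extras)

print(extras_require['all'])  # ['cupy-cuda11x', 'pytest']
```

With an extra like this in place, `pip install deepspeed[all]` pulls in every optional dependency at once.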
@@ -89,7 +89,7 @@ def fetch_requirements(path):

cmdclass = {}

-# For any pre-installed ops force disable ninja
+# For any pre-installed ops force disable ninja.
if torch_available:
    from accelerator import get_accelerator
    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(
@@ -103,7 +103,7 @@ def fetch_requirements(path):
TORCH_MINOR = "0"

if torch_available and not torch.cuda.is_available():
-    # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
+    # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
    print(
        "[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
        "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
@@ -148,14 +148,14 @@ def op_enabled(op_name):
    op_compatible = builder.is_compatible()
    compatible_ops[op_name] = op_compatible

-    # If op is requested but not available, throw an error
+    # If op is requested but not available, throw an error.
    if op_enabled(op_name) and not op_compatible:
        env_var = op_envvar(op_name)
        if env_var not in os.environ:
            builder.warning(f"One can disable {op_name} with {env_var}=0")
        abort(f"Unable to pre-compile {op_name}")

-    # if op is compatible but install is not enabled (JIT mode)
+    # If op is compatible but install is not enabled (JIT mode)
    if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
        builder.hipify_extension()

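For readers outside the repository: `op_enabled` and `op_envvar` gate which ops get pre-compiled versus left to JIT. A rough sketch of that gating pattern; the env-var naming scheme and the `DS_BUILD_OPS` default are assumptions, not the repository's exact implementation:

```python
import os

def op_envvar(op_name: str) -> str:
    # Assumption: each op exposes a DS_BUILD_* switch derived from its name.
    return f"DS_BUILD_{op_name.upper()}"

def op_enabled(op_name: str) -> bool:
    # Pre-compile an op only when its switch (or a global default) requests it.
    default = os.environ.get("DS_BUILD_OPS", "0")
    return bool(int(os.environ.get(op_envvar(op_name), default)))

# Example: DS_BUILD_FUSED_ADAM=1 requests pre-compiling the "fused_adam" op.
```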
@@ -167,7 +167,7 @@ def op_enabled(op_name):

print(f'Install Ops={install_ops}')

-# Write out version/git info
+# Write out version/git info.
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
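The body of this `if` is truncated by the diff view; presumably the two commands are run via `subprocess`, along these lines (a hedged sketch, not the repository's exact helper):

```python
import subprocess

def run_cmd(cmd: str) -> str:
    # Run a shell command and return its trimmed stdout.
    return subprocess.check_output(cmd, shell=True).decode("utf-8").strip()

git_hash = run_cmd("git rev-parse --short HEAD")         # e.g. "a1b2c3d"
git_branch = run_cmd("git rev-parse --abbrev-ref HEAD")  # e.g. "master"
```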
@@ -200,38 +200,38 @@ def create_dir_symlink(src, dest):
    create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
    egg_info.manifest_maker.template = 'MANIFEST_win.in'

-# Parse the DeepSpeed version string from version.txt
+# Parse the DeepSpeed version string from version.txt.
version_str = open('version.txt', 'r').read().strip()

# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
-# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
+# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.

-# Building wheel for distribution, update version file
+# Building wheel for distribution, update version file.
if 'DS_BUILD_STRING' in os.environ:
-    # Build string env specified, probably building for distribution
+    # Build string env specified, probably building for distribution.
    with open('build.txt', 'w') as fd:
        fd.write(os.environ.get('DS_BUILD_STRING'))
    version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'):
-    # build.txt exists, probably installing from distribution
+    # build.txt exists, probably installing from distribution.
    with open('build.txt', 'r') as fd:
        version_str += fd.read().strip()
else:
-    # None of the above, probably installing from source
+    # None of the above, probably installing from source.
    version_str += f'+{git_hash}'

torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
bf16_support = False
-# Set cuda_version to 0.0 if cpu-only
+# Set cuda_version to 0.0 if cpu-only.
cuda_version = "0.0"
nccl_version = "0.0"
-# Set hip_version to 0.0 if cpu-only
+# Set hip_version to 0.0 if cpu-only.
hip_version = "0.0"
if torch_available and torch.version.cuda is not None:
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
if sys.platform != "win32":
if isinstance(torch.cuda.nccl.version(), int):
# This will break if minor version > 9
# This will break if minor version > 9.
nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
else:
nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
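Finally, a worked example of the three-way version-string logic earlier in this diff (`DS_BUILD_STRING`, then `build.txt`, then git hash); the base version, build string, and hash are invented for illustration:

```python
def resolve_version(base, env, build_txt, git_hash):
    # Reproduces the three branches of the version logic shown above.
    if 'DS_BUILD_STRING' in env:   # distribution build
        return base + env['DS_BUILD_STRING']
    if build_txt:                  # installing from a distribution
        return base + build_txt.strip()
    return base + f'+{git_hash}'   # installing from source

print(resolve_version('0.9.0', {'DS_BUILD_STRING': '.dev20230407'}, None, 'a1b2c3d'))
# 0.9.0.dev20230407
print(resolve_version('0.9.0', {}, None, 'a1b2c3d'))
# 0.9.0+a1b2c3d
```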