MaxVit model #6342
@@ -0,0 +1,122 @@
import argparse
import os
import uuid
from pathlib import Path

import submitit
import train


def parse_args():
    train_parser = train.get_args_parser(add_help=False)
    parser = argparse.ArgumentParser("Submitit for train", parents=[train_parser], add_help=True)
    parser.add_argument("--ngpus", default=8, type=int, help="Number of GPUs to request on each node")
    parser.add_argument("--nodes", default=1, type=int, help="Number of nodes to request")
    parser.add_argument("--timeout", default=60 * 24 * 30, type=int, help="Duration of the job, in minutes")
    parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
    parser.add_argument("--partition", default="train", type=str, help="The partition to submit to (default: train).")
    return parser.parse_args()


def get_shared_folder() -> Path:
    user = os.getenv("USER")
    path = "/data/checkpoints"
    if Path(path).is_dir():
        p = Path(f"{path}/{user}/experiments")
        p.mkdir(parents=True, exist_ok=True)  # create the per-user subtree if it is missing
        return p
    raise RuntimeError("No shared folder available")


def get_init_file_folder() -> Path:
    user = os.getenv("USER")
    path = "/shared"
    if Path(path).is_dir():
        p = Path(f"{path}/{user}")
        p.mkdir(exist_ok=True)
        return p
    raise RuntimeError("No shared folder available")


def get_init_file():
    # Init file must not exist, but its parent dir must exist.
    os.makedirs(str(get_init_file_folder()), exist_ok=True)
    init_file = get_init_file_folder() / f"{uuid.uuid4().hex}_init"
    if init_file.exists():
        os.remove(str(init_file))
    return init_file
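
The path returned here is later handed to the trainer as a file:// URI (see checkpoint() and main() below). For context, a minimal sketch of the file-based rendezvous that train.py is expected to perform with it, assuming the standard torch.distributed API (the helper name is illustrative, not part of this PR):

import torch.distributed as dist

def init_distributed(args):
    # Hypothetical consumer of args.dist_url: every rank rendezvous through
    # the (not-yet-existing) file behind the file:// URI; once the process
    # group is up, the file can be discarded.
    dist.init_process_group(
        backend="nccl",
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )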


class Trainer(object):
    def __init__(self, args):
        self.args = args

    def __call__(self):
        import train

        self._setup_gpu_args()
        train.main(self.args)

    def checkpoint(self):
        # Called by submitit when the job is preempted or times out:
        # point the next run at the latest checkpoint and requeue.
        import os

        import submitit

        self.args.dist_url = get_init_file().as_uri()
        checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth")
        if os.path.exists(checkpoint_file):
            self.args.resume = checkpoint_file
        print("Requeuing ", self.args)
        empty_trainer = type(self)(self.args)
        return submitit.helpers.DelayedSubmission(empty_trainer)

    def _setup_gpu_args(self):
        from pathlib import Path

        import submitit

        job_env = submitit.JobEnvironment()
        self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
        self.args.gpu = job_env.local_rank
        self.args.rank = job_env.global_rank
        self.args.world_size = job_env.num_tasks
        print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
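
submitit drives the checkpoint() hook above: shortly before preemption or timeout (see slurm_signal_delay_s below) it calls checkpoint() on the submitted callable and requeues the DelayedSubmission it returns. A minimal self-contained sketch of that contract, independent of this PR's Trainer:

import submitit

class Counter:
    # Toy checkpointable callable: state lives on the object itself,
    # so the requeued copy resumes where the evicted job stopped.
    def __init__(self, start=0):
        self.start = start

    def __call__(self):
        return self.start + 1

    def checkpoint(self):
        # Return a DelayedSubmission wrapping the resumed state; submitit
        # pickles it and submits it as a fresh job.
        return submitit.helpers.DelayedSubmission(Counter(self.start + 1))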


def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=300)

    # cluster setup comes from the CLI arguments parsed above
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    executor.update_parameters(
        # mem_gb=96 * num_gpus_per_node,  # 768GB per machine
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=12,  # 96 cpus per machine
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        slurm_partition=args.partition,
        slurm_signal_delay_s=120,
    )

    executor.update_parameters(name="torchvision")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)

    print("Submitted job_id:", job.job_id)


if __name__ == "__main__":
    main()
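
Before burning cluster time, the same Trainer can be exercised through submitit's local executor, which runs jobs in subprocesses on the current machine. A hypothetical smoke test (the folder and parameters are placeholders, and `trainer` is assumed to be built exactly as in main()):

import submitit

executor = submitit.AutoExecutor(folder="/tmp/submitit_smoke", cluster="local")
executor.update_parameters(timeout_min=10, gpus_per_node=1, tasks_per_node=1)
job = executor.submit(trainer)
print(job.result())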
@@ -0,0 +1,39 @@
import unittest

import pytest
import torch

from torchvision.models.maxvit import SwapAxes, WindowDepartition, WindowPartition


class MaxvitTester(unittest.TestCase):

Review comment on this class: "I understand that here you are testing specific layers from MaxViT. This is not something we did previously, so perhaps it does need to be in a separate file. @YosuaMichael any thoughts here?" Reply: "Sorry for a pretty late response!"
    def test_maxvit_window_partition(self):
        input_shape = (1, 3, 224, 224)
        partition_size = 7

        x = torch.randn(input_shape)

        partition = WindowPartition(partition_size=partition_size)
        departition = WindowDepartition(partition_size=partition_size, n_partitions=input_shape[3] // partition_size)

        # partitioning followed by departitioning must be the identity
        assert torch.allclose(x, departition(partition(x)))

    def test_maxvit_grid_partition(self):
        input_shape = (1, 3, 224, 224)
        partition_size = 7

        x = torch.randn(input_shape)
        # grid partition = window partition with the complementary size, plus an axis swap
        partition = torch.nn.Sequential(
            WindowPartition(partition_size=input_shape[3] // partition_size),
            SwapAxes(-2, -3),
        )
        departition = torch.nn.Sequential(
            SwapAxes(-2, -3),
            WindowDepartition(partition_size=input_shape[3] // partition_size, n_partitions=partition_size),
        )

        assert torch.allclose(x, departition(partition(x)))


if __name__ == "__main__":
    pytest.main([__file__])
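
For anyone exploring these layers interactively, a quick round-trip sketch using the same constructors as the tests above (the intermediate shape is printed rather than asserted, since only the round trip is guaranteed here):

import torch

from torchvision.models.maxvit import WindowDepartition, WindowPartition

p = 7
x = torch.randn(2, 64, 28, 28)  # (B, C, H, W) with H and W divisible by p

partition = WindowPartition(partition_size=p)
departition = WindowDepartition(partition_size=p, n_partitions=28 // p)

windows = partition(x)
print(windows.shape)  # windowed layout produced by the layer
assert torch.allclose(x, departition(windows))  # lossless round trip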