
Commit bb0a668

[hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
1 parent 5fcd779 commit bb0a668
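
In short: the example training loops no longer pass `return_outputs=True` to `booster.execute_pipeline`, since the flag defaults to `False` and collecting per-microbatch outputs costs extra memory. A before/after sketch assembled from the diffs below (the surrounding names `booster`, `train_dataloader_iter`, `model`, `_criterion`, `optimizer`, and `is_pp_last_stage` come from the edited tutorials, not from this snippet):

```python
# Before this commit, the examples collected per-microbatch outputs:
#     outputs = booster.execute_pipeline(
#         train_dataloader_iter, model, _criterion, optimizer,
#         return_loss=True, return_outputs=True,
#     )
# After: return_outputs is left at its default (False), avoiding the extra memory.
outputs = booster.execute_pipeline(
    train_dataloader_iter, model, _criterion, optimizer, return_loss=True
)
if is_pp_last_stage:
    loss = outputs["loss"]  # the loss is still returned because return_loss=True
```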

File tree: 24 files changed, +28 -36 lines changed

applications/ColossalMoE/train.py

Lines changed: 0 additions & 1 deletion

@@ -238,7 +238,6 @@ def main():
             lambda x, y: x.loss,
             optimizer,
             return_loss=True,
-            return_outputs=True,
         )
         # Backward and optimize
         if is_pp_last_stage:

colossalai/booster/plugin/hybrid_parallel_plugin.py

Lines changed: 3 additions & 0 deletions

@@ -1183,6 +1183,9 @@ def execute_pipeline(
     ) -> dict:
         assert self.enable_pipeline_parallelism, "pipeline parallelism is not enabled"

+        if return_outputs:
+            warnings.warn("return_outputs may lead to significant extra memory consumption.")
+
         # Create a context for gradient synchronization based on the optimizer type.
         # If it's a HybridParallelZeroOptimizer, use optimizer.no_sync(); otherwise, use model.no_sync().
         # This is to avoid redundant gradient reduction in pipeline parallelism (multiple microbatch values should be reduced once),
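
With this hunk, `execute_pipeline` emits a `UserWarning` whenever `return_outputs=True` is passed. If a caller genuinely needs the stitched outputs and accepts the memory cost, the warning can be silenced with the standard `warnings` module; the sketch below is a caller-side illustration under that assumption, not part of this commit (the `booster`, `data_iter`, `model`, `criterion`, and `optimizer` names mirror the tutorial code):

```python
import warnings

# Hypothetical caller-side filter: keep return_outputs=True on purpose and
# suppress the new memory-consumption warning for this one call.
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore", message="return_outputs may lead to significant extra memory consumption"
    )
    output_dict = booster.execute_pipeline(
        data_iter, model, criterion, optimizer, return_loss=True, return_outputs=True
    )
```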

colossalai/pipeline/schedule/one_f_one_b.py

Lines changed: 3 additions & 7 deletions

@@ -7,7 +7,7 @@
 from torch.utils._pytree import tree_map

 from colossalai.accelerator import get_accelerator
-from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.interface import OptimizerWrapper
 from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.utils import get_current_device

@@ -327,9 +327,7 @@ def run_forward_only(
             self.send_forward(output_obj)

         if outputs is not None:
-            if isinstance(model, ModelWrapper):
-                model = model.unwrap()
-            outputs = merge_batch(outputs, getattr(model, "batch_size_dim", 0))
+            outputs = merge_batch(outputs)
         return {"loss": accum_loss, "outputs": outputs}

     def run_forward_backward(

@@ -412,9 +410,7 @@ def run_forward_backward(
         assert all(len(v) == 0 for v in input_objs) and all(len(v) == 0 for v in output_objs)

         if outputs is not None:
-            if isinstance(model, ModelWrapper):
-                model = model.unwrap()
-            outputs = merge_batch(outputs, getattr(model, "batch_size_dim", 0))
+            outputs = merge_batch(outputs)
         return {"loss": accum_loss, "outputs": outputs}

     def forward_backward_step(
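
The schedule no longer unwraps the model to look up `batch_size_dim`; `merge_batch` is simply called with its default dimension. For intuition, a hypothetical stand-in for what such a merge does is sketched below (the real `merge_batch` lives in colossalai and may differ):

```python
from typing import Any, List

import torch


def merge_batch_sketch(outputs: List[Any], dim: int = 0) -> Any:
    """Hypothetical stand-in for merge_batch: concatenate per-microbatch outputs along `dim`."""
    first = outputs[0]
    if isinstance(first, torch.Tensor):
        return torch.cat(outputs, dim=dim)
    if isinstance(first, dict):
        # Merge dict-style model outputs key by key.
        return {key: merge_batch_sketch([out[key] for out in outputs], dim) for key in first}
    return outputs  # leave anything else untouched
```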

docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md

Lines changed: 1 addition & 1 deletion

@@ -178,7 +178,7 @@ def train_epoch(
     for _ in pbar:
         if use_pipeline:
             outputs = booster.execute_pipeline(
-                train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
+                train_dataloader_iter, model, _criterion, optimizer, return_loss=True
             )
             # Backward and optimize
             if is_pp_last_stage:

docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md

Lines changed: 1 addition & 1 deletion

@@ -231,7 +231,7 @@ def run_forward_backward(
     if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
         # run pipeline forward backward when enabling pp in hybrid parallel plugin
         output_dict = booster.execute_pipeline(
-            data_iter, model, criterion, optimizer, return_loss=True, return_outputs=True
+            data_iter, model, criterion, optimizer, return_loss=True
        )
         loss, outputs = output_dict["loss"], output_dict["outputs"]
     else:

docs/source/en/features/pipeline_parallel.md

Lines changed: 1 addition & 2 deletions

@@ -198,8 +198,7 @@ def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion:
                 model,
                 _criterion,
                 optimizer,
-                return_loss=True,
-                return_outputs=True)
+                return_loss=True)
             # Backward and optimize
             if is_pp_last_stage:
                 loss = outputs['loss']

docs/source/en/features/shardformer.md

Lines changed: 1 addition & 1 deletion

@@ -271,7 +271,7 @@ However, if pipeline parallel is enabled, there are several usages different fro
 3. Do forward and backward passing through calling `Booster.execute_pipeline` method:
    ```python
    outputs = booster.execute_pipeline(
-       train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
+       train_dataloader_iter, model, _criterion, optimizer, return_loss=True
    )
    ```
    Backward passing has been completed by this method, so there is no need to call `loss.backward()` after executing this method.
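
As the unchanged context line above notes, the backward pass already happens inside `execute_pipeline`, so a training step in these docs only needs the optimizer update afterwards; roughly (assuming the tutorial's `optimizer` wrapper, which exposes the usual `step`/`zero_grad` methods):

```python
# No loss.backward() here: execute_pipeline has already run backward.
optimizer.step()
optimizer.zero_grad()
```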

docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md

Lines changed: 1 addition & 1 deletion

@@ -175,7 +175,7 @@ def train_epoch(
     for _ in pbar:
         if use_pipeline:
             outputs = booster.execute_pipeline(
-                train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
+                train_dataloader_iter, model, _criterion, optimizer, return_loss=True
             )
             # Backward and optimize
             if is_pp_last_stage:

docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md

Lines changed: 1 addition & 1 deletion

@@ -234,7 +234,7 @@ def run_forward_backward(
     if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
         # run pipeline forward backward when enabling pp in hybrid parallel plugin
         output_dict = booster.execute_pipeline(
-            data_iter, model, criterion, optimizer, return_loss=True, return_outputs=True
+            data_iter, model, criterion, optimizer, return_loss=True
        )
         loss, outputs = output_dict["loss"], output_dict["outputs"]
     else:

docs/source/zh-Hans/features/pipeline_parallel.md

Lines changed: 1 addition & 2 deletions

@@ -193,8 +193,7 @@ def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion:
                 model,
                 _criterion,
                 optimizer,
-                return_loss=True,
-                return_outputs=True)
+                return_loss=True)
             # Backward and optimize
             if is_pp_last_stage:
                 loss = outputs['loss']
