Local execution e2e training #1472

Open
reyna-abhyankar wants to merge 46 commits into master from local-e2e-training

Commits (46):
6adb290
temporary weight adjust index
reyna-abhyankar Aug 25, 2024
61697c2
Loss function
reyna-abhyankar Aug 27, 2024
b56c046
Add cuda test for loss function
reyna-abhyankar Aug 27, 2024
f75a3d4
Format
reyna-abhyankar Aug 27, 2024
f74711f
Refactor and build optimizer kernels, op
reyna-abhyankar Aug 27, 2024
40c6252
Finish optimizer local backing
reyna-abhyankar Aug 27, 2024
ad9b9ea
Format
reyna-abhyankar Aug 27, 2024
1ddfade
E2E update test
reyna-abhyankar Aug 27, 2024
dde9496
Format
reyna-abhyankar Aug 27, 2024
59635d8
Small fixes
reyna-abhyankar Sep 11, 2024
103ef07
Format
reyna-abhyankar Sep 11, 2024
f48f9ff
Fix test and small issues
reyna-abhyankar Sep 18, 2024
189c9c8
Format
reyna-abhyankar Sep 18, 2024
d93f464
Merge branch 'repo-refactor' into local-e2e-training
reyna-abhyankar Oct 1, 2024
b5647c8
Pass tests after merge
reyna-abhyankar Oct 1, 2024
f5ff91e
Fix input/weight differentiation
reyna-abhyankar Oct 1, 2024
7470e71
Fix signature to use unified rep
reyna-abhyankar Oct 1, 2024
deece1b
Fix model training instance abstraction
reyna-abhyankar Oct 1, 2024
1d3cc94
Change subcase test name
reyna-abhyankar Oct 1, 2024
3cf5d08
Quick fixes
reyna-abhyankar Oct 16, 2024
79ef4c9
Refactor training backing and instance
reyna-abhyankar Oct 22, 2024
a73b1c3
Expose op folders publicly
reyna-abhyankar Nov 13, 2024
c6fed29
Add tensor type, operate over reduced tensor
reyna-abhyankar Nov 13, 2024
0cdfb1a
Fixes
reyna-abhyankar Jan 7, 2025
9d252b3
Remove tensor lower
reyna-abhyankar Jan 15, 2025
895c117
Add tensor and task lowering scheme
reyna-abhyankar Jan 17, 2025
411017d
Build local exec
reyna-abhyankar Jan 22, 2025
0128abb
Disaggregate local backend
reyna-abhyankar Feb 1, 2025
277f8c2
Update task binding interface and cost estimator
reyna-abhyankar Feb 1, 2025
377c6aa
Merge master into local execution
reyna-abhyankar Feb 4, 2025
8efaec7
Build
reyna-abhyankar Feb 6, 2025
1dc1398
Format
reyna-abhyankar Feb 6, 2025
17ad5c8
Split task spec files
reyna-abhyankar Feb 6, 2025
639c2c1
Delete outdated sim environment file
reyna-abhyankar Feb 6, 2025
a697044
Finish API
reyna-abhyankar Feb 13, 2025
187a8d5
Add tests for allocated and unallocated
reyna-abhyankar Feb 13, 2025
a0f8113
Fix nonnegative
reyna-abhyankar Feb 13, 2025
b1eab94
Format
reyna-abhyankar Feb 13, 2025
b532c50
Pass allocated-unallocated tests
reyna-abhyankar Feb 13, 2025
f28e5c2
Update task registry tests
reyna-abhyankar Feb 13, 2025
89752fa
Move local tensor backing to dtgen
reyna-abhyankar Feb 22, 2025
aef8ad5
Remove lowered tensor source
reyna-abhyankar Feb 22, 2025
f0a4285
Loss and update tests
reyna-abhyankar Feb 24, 2025
9047edc
Merge master
reyna-abhyankar Feb 24, 2025
350babf
Passing tests after merge issues
reyna-abhyankar Feb 24, 2025
aef7c6e
Pass gpu tests
reyna-abhyankar Feb 25, 2025
Changes from 1 commit:
Update task binding interface and cost estimator
reyna-abhyankar committed Feb 1, 2025
commit 277f8c268632dfcc5622d96f55b65751d063d736
@@ -25,7 +25,6 @@ struct LocalTrainingBacking {
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
 
-private:
   GradientTensorSource gradient_tensor_source;
 };
 
@@ -42,7 +41,7 @@
 std::optional<float> execute_forward(LocalTrainingBacking &,
                                      layer_guid_t const &);
 std::optional<float> execute_backward(LocalTrainingBacking &,
                                       layer_guid_t const &);
-void compute_loss(LocalTrainingBacking const &,
+void compute_loss(LocalTrainingBacking &,
                   LossAttrs const &,
                   tensor_guid_t const &logit_tensor,
                   loss_tensor_t const &label_tensor);
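
compute_loss now takes the backing by mutable reference, since computing the loss writes into gradient state. A minimal call-site sketch (the attrs and tensor ids are placeholders, not values from this PR):

  compute_loss(local_backing,   // LocalTrainingBacking &, no longer const
               loss_attrs,      // LossAttrs
               logit_tensor,    // tensor_guid_t
               label_tensor);   // loss_tensor_t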
lib/local-execution/include/local-execution/loss_functions.h (5 additions, 3 deletions)

@@ -16,19 +16,21 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 
+#include "local-execution/loss_tensor_t.dtg.h"
 #include "local-execution/task_impl_function.dtg.h"
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "op-attrs/ops/loss_functions.h"
 #include "pcg/tensor_guid_t.dtg.h"
-#include "local-execution/loss_tensor_t.dtg.h"
 
 namespace FlexFlow {
 
 TaskImplFunction get_loss_bwd_task_impl();
 TaskSignature get_loss_bwd_signature();
-TaskInvocation
-    backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label);
+TaskInvocation backward(LossAttrs const &,
+                        tensor_guid_t logit,
+                        gradient_tensor_t logit_grad,
+                        loss_tensor_t label);
 
 } // namespace FlexFlow
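
A sketch of the revised entry point, which now threads the logit gradient explicitly; the LossAttrs value and tensor ids below are placeholders:

  TaskInvocation loss_bwd =
      backward(loss_attrs,         // LossAttrs
               logit_tensor,       // tensor_guid_t
               logit_grad_tensor,  // gradient_tensor_t, new in this commit
               label_tensor);      // loss_tensor_t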
@@ -13,8 +13,8 @@ using PerLayerElapsedTime =
 
 struct ModelTrainingInstance {
   ModelTrainingInstance(LocalTrainingBacking const &,
-                        tensor_guid_t const & logit_tensor,
-                        TensorShape const & label_tensor_shape,
+                        tensor_guid_t const &logit_tensor,
+                        TensorShape const &label_tensor_shape,
                         LossAttrs const &,
                         OptimizerAttrs const &);
@@ -10,11 +10,12 @@
 
 namespace FlexFlow {
 
-TaskInvocation
-    lower_to_task_invocation(OpTaskInvocation const &,
-                             layer_guid_t const &,
-                             ComputationGraph const &,
-                             std::optional<DeviceSpecificDeviceStates> const &);
+TaskInvocation lower_to_task_invocation(
+    OpTaskInvocation const &,
+    layer_guid_t const &,
+    ComputationGraph const &,
+    std::unordered_map<tensor_guid_t, gradient_tensor_t> const &,
+    std::optional<DeviceSpecificDeviceStates> const &);
 
 ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &,
                                            RuntimeArgConfig const &);
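
The new parameter maps each graph tensor to its gradient tensor so grad slots can be resolved during lowering. A hedged sketch of a call under the new signature (map contents and surrounding variables are illustrative, and std::hash support for tensor_guid_t is assumed):

  std::unordered_map<tensor_guid_t, gradient_tensor_t> tensor_to_grad = {
      {logit_tensor, logit_grad_tensor},
  };
  TaskInvocation lowered = lower_to_task_invocation(op_task_invocation,
                                                    layer_guid,
                                                    computation_graph,
                                                    tensor_to_grad,
                                                    std::nullopt);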
lib/local-execution/include/local-execution/optimizer.h (3 additions, 0 deletions)

@@ -14,18 +14,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &);
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &,
     tensor_guid_t const &weight,
+    gradient_tensor_t const &weight_grad,
     std::vector<optimizer_tensor_t> const &grad_buffer_tensors);
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
 TaskInvocation sgd_update(SGDOptimizerAttrs const &,
                           tensor_guid_t const &weight,
+                          gradient_tensor_t const &weight_grad,
                           optimizer_tensor_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
                            tensor_guid_t const &weight,
+                           gradient_tensor_t const &weight_grad,
                            optimizer_tensor_t const &adam_v,
                            optimizer_tensor_t const &adam_m);
 TaskImplFunction get_adam_update_task_impl();
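
Each update invocation now receives the weight's gradient tensor alongside the weight itself. A sketch for the SGD case, with placeholder attrs and tensor ids:

  TaskInvocation update =
      sgd_update(sgd_attrs,           // SGDOptimizerAttrs
                 weight_tensor,       // tensor_guid_t
                 weight_grad_tensor,  // gradient_tensor_t, new in this commit
                 sgd_v_tensor);       // optimizer_tensor_t momentum buffer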
@@ -50,7 +50,7 @@ struct TaskArgumentAccessor {
 
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_optimizer_tensor(int slot) const {
-    return this->get_tensor_grad<PRIV>(slot_id_t{slot});
+    return this->get_optimizer_tensor<PRIV>(slot_id_t{slot});
   }
 
   template <Permissions PRIV>
@@ -59,17 +59,16 @@ struct TaskArgumentAccessor {
         this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER));
   }
 
-  // template <Permissions PRIV>
-  // privilege_mode_to_accessor<PRIV> get_non_graph_tensor(int slot) const {
-  //   return this->get_tensor_grad<PRIV>(slot_id_t{slot});
-  // }
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_loss_tensor(int slot) const {
+    return this->get_loss_tensor<PRIV>(slot_id_t{slot});
+  }
 
-  // template <Permissions PRIV>
-  // privilege_mode_to_accessor<PRIV> get_non_graph_tensor(slot_id_t slot) const
-  // {
-  //   return std::get<privilege_mode_to_accessor<PRIV>>(
-  //       this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH));
-  // }
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_loss_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::LOSS));
+  }
 
   // variadic tensors
   template <Permissions PRIV>
@@ -101,7 +100,7 @@ struct TaskArgumentAccessor {
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_optimizer_tensor(int slot) const {
-    return this->get_variadic_tensor_grad<PRIV>(slot_id_t{slot});
+    return this->get_variadic_optimizer_tensor<PRIV>(slot_id_t{slot});
   }
 
   template <Permissions PRIV>
@@ -111,18 +110,18 @@ struct TaskArgumentAccessor {
         this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER));
   }
 
-  // template <Permissions PRIV>
-  // std::vector<privilege_mode_to_accessor<PRIV>>
-  //     get_variadic_non_graph_tensor(int slot) const {
-  //   return this->get_variadic_tensor_grad<PRIV>(slot_id_t{slot});
-  // }
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_loss_tensor(int slot) const {
+    return this->get_variadic_loss_tensor<PRIV>(slot_id_t{slot});
+  }
 
-  // template <Permissions PRIV>
-  // std::vector<privilege_mode_to_accessor<PRIV>>
-  //     get_variadic_non_graph_tensor(slot_id_t slot) const {
-  //   return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-  //       this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH));
-  // }
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_loss_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::LOSS));
+  }
 
   Allocator get_allocator() const {
     return this->ptr->get_allocator();
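
With loss tensors promoted to a first-class TensorType, a task body can pull them through the accessor directly. A sketch of how a loss task implementation might read its slots (the slot constants, permissions, and overall shape are illustrative, not taken from this diff):

  static void loss_bwd_task(TaskArgumentAccessor const &acc) {
    auto logit = acc.get_tensor<Permissions::RO>(LOGIT);
    auto logit_grad = acc.get_tensor_grad<Permissions::RW>(LOGIT_GRAD);
    auto label = acc.get_loss_tensor<Permissions::RO>(LABEL);
    // launch the loss backward kernel over these accessors
  }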
lib/local-execution/include/local-execution/task_binding.h (6 additions, 6 deletions)

@@ -19,14 +19,14 @@ struct TaskBinding {
   void bind(int, tensor_guid_t const &);
   void bind(slot_id_t, tensor_guid_t const &);
 
-  void bind_grad(int, tensor_guid_t const &);
-  void bind_grad(slot_id_t, tensor_guid_t const &);
+  void bind_grad(int, gradient_tensor_t const &);
+  void bind_grad(slot_id_t, gradient_tensor_t const &);
 
-  void bind(int, optimizer_tensor_t const &);
-  void bind(slot_id_t, optimizer_tensor_t const &);
+  void bind_optimizer(int, optimizer_tensor_t const &);
+  void bind_optimizer(slot_id_t, optimizer_tensor_t const &);
 
-  void bind(int, loss_tensor_t const &);
-  void bind(slot_id_t, loss_tensor_t const &);
+  void bind_loss(int, loss_tensor_t const &);
+  void bind_loss(slot_id_t, loss_tensor_t const &);
 
   template <typename T>
   void bind_arg(int name, T const &t) {
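
Splitting the former bind overloads into bind_grad / bind_optimizer / bind_loss makes the tensor category explicit at each call site instead of relying on overload resolution. A minimal sketch (slot ids and tensor ids are illustrative):

  TaskBinding binding;
  binding.bind(LOGIT, logit_tensor);            // tensor_guid_t
  binding.bind_grad(LOGIT_GRAD, logit_grad);    // gradient_tensor_t
  binding.bind_optimizer(SGD_V, sgd_v_tensor);  // optimizer_tensor_t
  binding.bind_loss(LABEL, label_tensor);       // loss_tensor_t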
@@ -5,6 +5,7 @@
 #include "local-execution/op_task_type.dtg.h"
 #include "local-execution/task_registry.dtg.h"
 #include "op-attrs/computation_graph_op_attrs.h"
+#include "pcg/computation_graph.dtg.h"
 
 namespace FlexFlow {
lib/local-execution/src/local_cost_estimator.cc (69 additions, 34 deletions)

@@ -8,6 +8,7 @@
 #include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/parallel_tensor_attrs.h"
+#include "utils/containers/concat_vectors.h"
 #include "utils/containers/sum.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/values.h"
@@ -17,6 +18,53 @@ namespace FlexFlow {
 
 LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
     : runtime_arg_config(config) {}
 
+static ComputationGraph create_computation_graph_for_local_cost_estimation(
+    PCGOperatorAttrs const &op,
+    std::vector<ParallelTensorShape> const &inputs,
+    std::vector<ParallelTensorAttrs> const &weights,
+    std::vector<ParallelTensorAttrs> const &outputs) {
+  ComputationGraph computation_graph = make_empty_computation_graph();
+
+  // create layer for inputs
+  auto get_vector_piece_attrs_from_parallel_tensor_shape =
+      [](std::vector<ParallelTensorShape> const &parallel_shapes) {
+        return transform(parallel_shapes, [](ParallelTensorShape const &p) {
+          return TensorAttrs{
+              get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES};
+        });
+      };
+
+  LayerAddedResult inputs_layer =
+      add_layer(computation_graph,
+                LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"},
+                {},
+                get_vector_piece_attrs_from_parallel_tensor_shape(inputs));
+
+  // create layer for weights
+  auto get_vector_piece_attrs_from_parallel_tensor_attrs =
+      [](std::vector<ParallelTensorAttrs> const &parallel_attrs) {
+        return transform(parallel_attrs, [](ParallelTensorAttrs const &p) {
+          return get_piece_attrs(p);
+        });
+      };
+
+  LayerAddedResult weights_layer =
+      add_layer(computation_graph,
+                LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"},
+                {},
+                get_vector_piece_attrs_from_parallel_tensor_attrs(weights));
+
+  // create operator layer
+  LayerAddedResult operator_layer = add_layer(
+      computation_graph,
+      LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"},
+      concat_vectors(inputs_layer.outputs, weights_layer.outputs),
+      get_vector_piece_attrs_from_parallel_tensor_attrs(outputs));
+
+  return computation_graph;
+}

 CostDetails LocalCostEstimator::estimate_cost(
     PCGOperatorAttrs const &op,
     std::vector<ParallelTensorShape> const &inputs,
@@ -29,47 +77,34 @@ CostDetails LocalCostEstimator::estimate_cost(
     return CostDetails{0, 0};
   }
 
-  LayerAttrs layer_attrs =
-      LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), std::nullopt};
+  // construct computation graph
+  ComputationGraph computation_graph =
+      create_computation_graph_for_local_cost_estimation(
+          op, inputs, weights, outputs);
 
-  // allocate memory for inputs
+  // allocate memory
   std::shared_ptr<TrackedAllocator> tracked_allocator_ptr =
      std::make_shared<TrackedAllocator>(create_local_cuda_memory_allocator());
   Allocator allocator = Allocator(tracked_allocator_ptr);
-  std::vector<tensor_guid_t> input_tensor_ids;
 
-  ComputationGraphBuilder cg_builder;
-  for (ParallelTensorShape const &input : inputs) {
-    TensorShape tensor_shape = get_piece_shape(input);
-    tensor_guid_t tensor_id =
-        cg_builder.create_input(tensor_shape, CreateGrad::YES);
-    input_tensor_ids.push_back(tensor_id);
-  }
-
-  auto get_vector_piece_attrs =
-      [](std::vector<ParallelTensorAttrs> const &parallel_attrs) {
-        return transform(parallel_attrs, [](ParallelTensorAttrs const &p) {
-          return get_piece_attrs(p);
-        });
-      };
+  LocalTrainingBacking local_backing(
+      allocator,
+      computation_graph,
+      LocalTensorBacking{},
+      LocalArgsBacking{this->runtime_arg_config});
 
-  // add operator to graph
-  LayerAddedResult layer_added_result =
-      cg_builder.add_layer_and_get_layer_added_result(
-          layer_attrs,
-          input_tensor_ids,
-          transform(get_vector_piece_attrs(weights),
-                    [&](TensorAttrs const &a) {
-                      return cg_builder.create_weight(a);
-                    }),
-          get_vector_piece_attrs(outputs));
+  allocate_all_computation_graph_tensors(local_backing.local_tensor_backing,
+                                         local_backing.gradient_tensor_source,
+                                         local_backing.computation_graph,
+                                         local_backing.allocator);
 
+  // execute layer
+  layer_guid_t operator_layer_guid =
+      get_layer_by_name(computation_graph, "operator");
+  execute_init(local_backing, operator_layer_guid);
+  float fwd = execute_forward(local_backing, operator_layer_guid).value();
+  float bwd = execute_backward(local_backing, operator_layer_guid).value();
-  LocalTrainingBacking local_backing(
-      allocator, cg_builder.computation_graph, this->runtime_arg_config);
-  local_backing.register_and_allocate_layer(layer_added_result.layer);
-  local_backing.execute_init(layer_added_result.layer);
-  float fwd = local_backing.execute_forward(layer_added_result.layer).value();
-  float bwd = local_backing.execute_backward(layer_added_result.layer).value();
   float total_execution_time = fwd + bwd;
 
   return CostDetails{total_execution_time,
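
For context, a sketch of how this path is exercised; the attrs, shapes, and config below are placeholders built elsewhere, not values from this PR:

  LocalCostEstimator estimator(runtime_arg_config);
  CostDetails cost = estimator.estimate_cost(
      op_attrs,       // PCGOperatorAttrs
      input_shapes,   // std::vector<ParallelTensorShape>
      weight_attrs,   // std::vector<ParallelTensorAttrs>
      output_attrs);  // std::vector<ParallelTensorAttrs>
  // cost bundles the measured fwd + bwd time with the memory usage recorded
  // by the TrackedAllocator.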