Add tensor and task lowering scheme

flexflow · reyna-abhyankar · Aug 25, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
commit 895c117100a0ac4cdb1cc1dead37f2efbe3786f9
diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml
diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h
@@ -0,0 +1,37 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
+
+#include "pcg/layer_guid_t.dtg.h"
+#include "pcg/computation_graph.h"
+#include "local-execution/per_device_op_state.h"
+#include "local-execution/op_task_invocation.h"
+#include "local-execution/runtime_arg_config.h"
+#include "local-execution/task_invocation.dtg.h"
+#include "local-execution/local_task_argument_accessor.h"
+
+namespace FlexFlow {
+// Argument-side backing state: per-layer device states plus the runtime arg config.
+struct LocalArgsBacking {
+  LocalArgsBacking(RuntimeArgConfig const &);
+  // No member function is defined in this header; behaviour notes below are hedged.
+public:
+  void add_per_device_op_state(layer_guid_t const &,
+                               DeviceSpecificDeviceStates const &);
+  // ^ presumably stores the state in per_device_op_states for that layer -- TODO confirm.
+  ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const;
+  // Two overloads: the op-arg-ref form additionally takes the graph and the layer.
+  ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const;
+  ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &,
+                                             ComputationGraph const &,
+                                             layer_guid_t const &) const;
+
+public:
+  // Public data: a map holding one DeviceSpecificDeviceStates per layer_guid_t key.
+  std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates>
+      per_device_op_states;
+  RuntimeArgConfig runtime_arg_config;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h
@@ -0,0 +1,58 @@
+
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H
+
+#include "kernels/accessor.h"
+#include "local-execution/local_task_argument_accessor.h"
+#include "local-execution/task_invocation.dtg.h"
+#include "local-execution/tensor_role.dtg.h"
+#include "local-execution/lowered_tensor_t.dtg.h"
+#include "local-execution/lowered_tensor_source.h"
+#include "local-execution/optimizer_tensor_t.dtg.h"
+#include "local-execution/loss_tensor_t.dtg.h"
+#include "pcg/computation_graph.dtg.h"
+#include "pcg/tensor_guid_t.dtg.h"
+#include "pcg/layer_guid_t.dtg.h"
+
+namespace FlexFlow {
+// Maps a lowered tensor id to a GenericTensorAccessorW.
+using TensorBackingMap =
+    std::unordered_map<lowered_tensor_t, GenericTensorAccessorW>;
+// Tensor-side backing state: accessors keyed by lowered id, plus id -> lowered id tables.
+struct LocalTensorBacking {
+  LocalTensorBacking();
+  // Declarations only: none of these member functions is defined in this header.
+public:
+  void allocate_layer_tensors(layer_guid_t const &,
+                              ComputationGraph const &,
+                              Allocator &);
+  void allocate_tensors_by_role(TensorRole const &,
+                                layer_guid_t const &,
+                                ComputationGraph const &,
+                                Allocator &);
+  void allocate_optimizer_tensors(tensor_guid_t const &,
+                                  std::vector<optimizer_tensor_t> const &,
+                                  Allocator &);
+  TensorSlotsBacking
+      construct_tensor_slots_backing(TaskBinding const &) const;
+  // NOTE(review): returns a reference, presumably into tensor_backings -- confirm lifetime.
+  GenericTensorAccessorW const &
+      get_tensor_backing(lowered_tensor_t const &) const;
+
+  bool is_tensor_allocated(lowered_tensor_t const &) const;
+
+public:
+  // Storage: lowered tensor id -> accessor (see TensorBackingMap).
+  TensorBackingMap tensor_backings;
+  // Lowering tables: tensor, gradient, optimizer and loss ids each map to a lowered id.
+  std::unordered_map<tensor_guid_t, lowered_tensor_t> tensor_lowering_mapping;
+  std::unordered_map<tensor_guid_t, lowered_tensor_t> gradient_tensor_lowering_mapping;
+  std::unordered_map<optimizer_tensor_t, lowered_tensor_t> optimizer_tensor_lowering_mapping;
+  std::unordered_map<loss_tensor_t, lowered_tensor_t> loss_tensor_lowering_mapping;
+  // Presumably the generator used to mint new lowered ids -- TODO confirm in the definitions.
+  LoweredTensorSource lowered_tensor_source;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -1,11 +1,13 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H
 
-#include "local-execution/local_slots_backing.h"
+#include "local-execution/local_tensor_backing.h"
+#include "local-execution/local_args_backing.h"
 #include "local-execution/task_registry.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
+#include "local-execution/optimizer_tensor_source.h"
 
 namespace FlexFlow {
 
@@ -15,8 +17,6 @@ using PerLayerElapsedTime =
 struct LocalTrainingBacking {
   LocalTrainingBacking(Allocator const &,
                        ComputationGraph const &,
-                       LayerTensorBackingMap const &allocated_forward_tensors,
-                       TensorBackingMap const &allocated_non_graph_tensors,
                        RuntimeArgConfig const &);
   void register_and_allocate_layer(layer_guid_t const &);
   void allocate_layer_optimizer_tensors(layer_guid_t const &,
@@ -25,17 +25,18 @@ struct LocalTrainingBacking {
   void execute_init(layer_guid_t const &);
   std::optional<float> execute_forward(layer_guid_t const &);
   void compute_loss(LossAttrs const &loss_attrs,
-                    reduced_tensor_t const &logit_tensor,
-                    reduced_tensor_t const &label_tensor);
+                    tensor_guid_t const &logit_tensor,
+                    loss_tensor_t const &label_tensor);
   std::optional<float> execute_backward(layer_guid_t const &);
   void execute_update(layer_guid_t const &, OptimizerAttrs const &);
 
   TaskArgumentAccessor
-      get_task_arg_accessor(TaskInvocation const &,
-                            std::optional<layer_guid_t> const &) const;
-  TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &,
-                                                layer_guid_t const &) const;
-  LocalSlotsBacking local_slots_backing;
+      get_task_arg_accessor(TaskInvocation const &) const;
+
+  TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const;
+
+  LocalTensorBacking local_tensor_backing;
+  LocalArgsBacking local_args_backing;
 
 private:
   DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
@@ -46,6 +47,10 @@ struct LocalTrainingBacking {
   Allocator allocator;
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
+
+  // optimizer
+  OptimizerTensorSource optimizer_tensor_source;
+  std::unordered_map<layer_guid_t, std::vector<optimizer_tensor_t>> layer_optimizer_tensor_ids;
 };
 
 } // namespace FlexFlow

diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
@@ -20,13 +20,15 @@
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "op-attrs/ops/loss_functions.h"
+#include "pcg/tensor_guid_t.dtg.h"
+#include "local-execution/loss_tensor_t.dtg.h"
 
 namespace FlexFlow {
 
 TaskImplFunction get_loss_bwd_task_impl();
 TaskSignature get_loss_bwd_signature();
 TaskInvocation
-    backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label);
+    backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label);
 
 } // namespace FlexFlow
 

diff --git a/...al-execution/reduced_tensor_t.struct.toml → ...local-execution/loss_tensor_t.struct.toml b/...al-execution/reduced_tensor_t.struct.toml → ...local-execution/loss_tensor_t.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "reduced_tensor_t"
+name = "loss_tensor_t"
 features = [
   "eq",
   "ord",

diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H
+#define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H
+
+#include "local-execution/lowered_tensor_t.dtg.h"
+
+namespace FlexFlow {
+// Produces lowered_tensor_t values through new_lowered_tensor().
+struct LoweredTensorSource {
+public:
+  LoweredTensorSource();
+  // Presumably returns a previously unused id on each call -- TODO confirm in the definition.
+  lowered_tensor_t new_lowered_tensor();
+  // static: a single counter shared by every LoweredTensorSource instance, not one per object.
+private:
+  static size_t next_available_lowered_tensor_id; // NOTE(review): lowered_tensor_t.raw_index is int -- confirm the conversion
+};
+
+} // namespace FlexFlow
+
+
+#endif
diff --git a/...ution/non_graph_tensor_guid_t.struct.toml → ...al-execution/lowered_tensor_t.struct.toml b/...ution/non_graph_tensor_guid_t.struct.toml → ...al-execution/lowered_tensor_t.struct.toml
@@ -1,13 +1,13 @@
 namespace = "FlexFlow"
-name = "non_graph_tensor_guid_t"
+name = "lowered_tensor_t"
 features = [
   "eq",
   "ord",
   "hash",
   "fmt",
-  "json",
 ]
 
+
 [[fields]]
-name = "raw_uid"
+name = "raw_index"
 type = "int"
diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h
@@ -3,6 +3,8 @@
 
 #include "local-execution/local_training_backing.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/tensor_guid_t.dtg.h"
+#include "local-execution/loss_tensor_t.dtg.h"
 
 namespace FlexFlow {
 
@@ -12,12 +14,10 @@ using PerLayerElapsedTime =
 struct ModelTrainingInstance {
   ModelTrainingInstance(Allocator const &,
                         ComputationGraph const &,
-                        LayerTensorBackingMap const &allocated_forward_tensors,
-                        TensorBackingMap const &allocated_non_graph_tensors,
                         RuntimeArgConfig const &,
                         LossAttrs const &,
-                        reduced_tensor_t const &logit_tensor,
-                        reduced_tensor_t const &label_tensor,
+                        tensor_guid_t const &logit_tensor,
+                        loss_tensor_t const &label_tensor,
                         OptimizerAttrs const &);
 
   void execute_init();
@@ -28,8 +28,8 @@ struct ModelTrainingInstance {
   ComputationGraph computation_graph;
   LocalTrainingBacking training_backing;
   LossAttrs loss_attrs;
-  reduced_tensor_t logit_tensor;
-  reduced_tensor_t label_tensor;
+  tensor_guid_t logit_tensor;
+  loss_tensor_t label_tensor;
   OptimizerAttrs optimizer_attrs;
 };
 

diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h
@@ -10,7 +10,7 @@
 #include "local-execution/op_tensor_spec.h"
 #include "local-execution/profiling.h"
 #include "local-execution/runtime_arg_ref.h"
-#include "local-execution/slot_tensor_type_id.dtg.h"
+#include "local-execution/slot_grad_id.dtg.h"
 #include "local-execution/task_id_t.dtg.h"
 #include "local-execution/variadic_tensor_ref.h"
 #include <typeindex>
@@ -84,14 +84,14 @@ struct OpTaskBinding {
   bool operator==(OpTaskBinding const &other) const;
   bool operator!=(OpTaskBinding const &other) const;
 
-  std::unordered_map<SlotTensorTypeId, OpTensorSpec> const &
+  std::unordered_map<SlotGradId, OpTensorSpec> const &
       get_tensor_bindings() const;
   std::unordered_map<slot_id_t, OpArgSpec> const &get_arg_bindings() const;
 
   void bind_from_forward(OpTaskBinding const &fwd);
 
 private:
-  std::unordered_map<SlotTensorTypeId, OpTensorSpec> tensor_bindings;
+  std::unordered_map<SlotGradId, OpTensorSpec> tensor_bindings;
   std::unordered_map<slot_id_t, OpArgSpec> arg_bindings;
 
 private: