Fixes

flexflow · reyna-abhyankar · Aug 25, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
commit 0cdfb1a7edd9ea283f678f06950054a701be8600
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
@@ -43,8 +43,6 @@ struct ArrayShape {
   std::optional<std::size_t> at_maybe(legion_dim_t) const;
   std::optional<std::size_t> at_maybe(ff_dim_t) const;
 
-  ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;
-
   ArrayShape sub_shape(std::optional<ff_dim_t> start,
                        std::optional<ff_dim_t> end) const;
 

diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
@@ -10,9 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions);
 
-std::optional<legion_dim_t> legion_dim_from_ff_dim(std::optional<ff_dim_t>,
-                                                   int num_dimensions);
-
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;
 

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
@@ -1,6 +1,7 @@
 #include "kernels/array_shape.h"
 #include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
+#include "utils/containers/transform.h"
 
 namespace FlexFlow {
 
@@ -54,17 +55,17 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
-  legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims());
-  return this->sub_shape(start, legion_end);
-}
-
 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::optional<legion_dim_t> legion_start =
-      legion_dim_from_ff_dim(start, num_dims());
-  std::optional<legion_dim_t> legion_end =
-      legion_dim_from_ff_dim(end, num_dims());
+  std::optional<legion_dim_t> legion_start =
+      transform(start, [this](auto const &ff_start) {
+        return legion_dim_from_ff_dim(ff_start, this->num_dims());
+      });
+  // Same conversion for the optional end bound.
+  std::optional<legion_dim_t> legion_end =
+      transform(end, [this](auto const &ff_end) {
+        return legion_dim_from_ff_dim(ff_end, this->num_dims());
+      });
   return this->sub_shape(legion_start, legion_end);
 }
 

diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -15,6 +15,7 @@
 
 #include "device.h"
 #include "kernels/concat_kernels.h"
+#include "kernels/legion_dim.h"
 #include <cassert>
 
 namespace FlexFlow {
@@ -25,7 +26,8 @@ void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
-  blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements();
+  legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
+  blk_size = shape.sub_shape(legion_dim_t{0}, axis_legion_dim).num_elements();
   num_blocks = shape.sub_shape(axis, std::nullopt).num_elements();
 }
 

diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc
@@ -10,13 +10,4 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) {
   return legion_dim_t(num_dimensions - ff_dim.value - 1);
 }
 
-std::optional<legion_dim_t>
-    legion_dim_from_ff_dim(std::optional<ff_dim_t> ff_dim, int num_dimensions) {
-  if (ff_dim.has_value()) {
-    return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions);
-  } else {
-    return std::nullopt;
-  }
-}
-
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h
@@ -82,7 +82,7 @@ template <typename LABEL_TYPE>
 struct hash<::FlexFlow::ArgRefSpec<LABEL_TYPE>> {
   size_t operator()(::FlexFlow::ArgRefSpec<LABEL_TYPE> const &s) const {
     size_t result = 0;
-    ::FlexFlow::hash_combine(result, s.type_idx);
+    ::FlexFlow::hash_combine(result, s.type_idx, s.get_ref_type());
     return result;
   }
 };

diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h
@@ -24,6 +24,10 @@ struct ConcreteArgSpec {
     return this->type_idx;
   }
 
+  std::shared_ptr<void const> get_ptr() const {
+    return this->ptr;
+  }
+
   bool operator==(ConcreteArgSpec const &other) const;
   bool operator!=(ConcreteArgSpec const &other) const;
 
@@ -60,7 +64,7 @@ template <>
 struct hash<::FlexFlow::ConcreteArgSpec> {
   size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const {
     size_t result = 0;
-    ::FlexFlow::hash_combine(result, s.get_type_index());
+    ::FlexFlow::hash_combine(result, s.get_type_index(), s.get_ptr());
     return result;
   }
 };

diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml
@@ -4,8 +4,6 @@ features = [
   "eq",
   "ord",
   "hash",
-  "json",
-  "rapidcheck",
   "fmt",
 ]
 

diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -35,6 +35,7 @@ struct LocalTrainingBacking {
                             std::optional<layer_guid_t> const &) const;
   TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &,
                                                 layer_guid_t const &) const;
+  LocalSlotsBacking local_slots_backing;
 
 private:
   DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
@@ -45,7 +46,6 @@ struct LocalTrainingBacking {
   Allocator allocator;
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
-  LocalSlotsBacking local_slots_backing;
 };
 
 } // namespace FlexFlow

diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h
@@ -27,18 +27,6 @@ RuntimeArgRef<ProfilingSettings> profiling_settings();
 RuntimeArgRef<DeviceSpecific<PerDeviceFFHandle>> ff_handle();
 RuntimeArgRef<FFIterationConfig> iteration_config();
 
-// std::string format_as(RuntimeArgRefSpec const & x) {
-//   std::ostringstream oss;
-//   oss << "<RuntimeArgRefSpec";
-//   oss << " type_idx=" << x.get_type_index().name();
-//   oss << ">";
-//   return oss.str();
-// }
-
-// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) {
-//   return (s << fmt::to_string(x));
-// }
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
@@ -7,7 +7,6 @@
 #include "local-execution/task_arg_spec.dtg.h"
 #include "local-execution/task_id_t.dtg.h"
 #include "local-execution/task_signature.dtg.h"
-#include "utils/hash/unordered_map.h"
 
 namespace FlexFlow {
 
@@ -63,12 +62,7 @@ namespace std {
 
 template <>
 struct hash<::FlexFlow::TaskBinding> {
-  size_t operator()(::FlexFlow::TaskBinding const &s) const {
-    size_t result = 0;
-    hash_combine(result, s.get_tensor_bindings());
-    hash_combine(result, s.get_arg_bindings());
-    return result;
-  }
+  size_t operator()(::FlexFlow::TaskBinding const &s) const;
 };
 
 } // namespace std

diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h
@@ -14,7 +14,7 @@ void register_tasks_for_layer(TaskRegistry &,
                               layer_guid_t const &,
                               ComputationGraphOpAttrs const &attrs);
 
-bool registry_contains_op_task(TaskRegistry const &,
-                               layer_guid_t const &,
-                               OpTaskType const &);
+bool registry_contains_task_for_layer(TaskRegistry const &,
+                                      layer_guid_t const &,
+                                      OpTaskType const &);
 

diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc
diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc
diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/local-execution/ops/batch_norm.cc
diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc
diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc
diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc
diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/local-execution/ops/conv_2d.cc
diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc
diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/local-execution/ops/element_binary.cc
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/local-execution/ops/element_unary.cc
diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/local-execution/ops/flat.cc
diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/local-execution/ops/gather.cc
diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/local-execution/ops/input.cc
diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/local-execution/ops/layer_norm.cc
diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc
diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/local-execution/ops/noop.cc
diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/local-execution/ops/pool_2d.cc
diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/local-execution/ops/reduce.cc
diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/local-execution/ops/reduction.cc
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/local-execution/ops/repartition.cc
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/local-execution/ops/replicate.cc
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/local-execution/ops/reshape.cc
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/local-execution/ops/reverse.cc
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/local-execution/ops/softmax.cc
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/local-execution/ops/split.cc
diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/local-execution/ops/topk.cc
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc
diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/local-execution/ops/weight.cc
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
@@ -1,4 +1,5 @@
 #include "local-execution/local_cost_estimator.h"
+#include "local-execution/tensor_reduction.h"
 #include "kernels/device.h"
 #include "kernels/local_cuda_allocator.h"
 #include "local-execution/tracked_allocator.h"
@@ -8,21 +9,11 @@
 #include "pcg/computation_graph_builder.h"
 #include "pcg/parallel_tensor_attrs.h"
 #include "utils/containers/transform.h"
+#include "utils/containers/values.h"
+#include "utils/containers/sum.h"
 
 namespace FlexFlow {
 
-static float get_total_elapsed_time(PerLayerElapsedTime const &fwd,
-                                    PerLayerElapsedTime const &bwd) {
-  float total_elapsed_time = 0;
-  for (auto const &layer_elapsed_time : fwd) {
-    layer_guid_t layer_id = layer_elapsed_time.first;
-    float fwd_time = layer_elapsed_time.second.value();
-    float bwd_time = bwd.at(layer_id).value();
-    total_elapsed_time += fwd_time + bwd_time;
-  }
-  return total_elapsed_time;
-}
-
 LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
     : runtime_arg_config(config) {}
 
@@ -45,17 +36,13 @@ CostDetails LocalCostEstimator::estimate_cost(
   std::shared_ptr<TrackedAllocator> tracked_allocator_ptr =
       std::make_shared<TrackedAllocator>(create_local_cuda_memory_allocator());
   Allocator allocator = Allocator(tracked_allocator_ptr);
-  TensorBackingMap tensor_backing_map;
   std::vector<tensor_guid_t> input_tensor_ids;
 
   ComputationGraphBuilder cg_builder;
   for (ParallelTensorShape const &input : inputs) {
     TensorShape tensor_shape = get_piece_shape(input);
     tensor_guid_t tensor_id =
         cg_builder.create_input(tensor_shape, CreateGrad::YES);
-    GenericTensorAccessorW tensor_backing =
-        allocator.allocate_tensor(tensor_shape);
-    tensor_backing_map.insert({tensor_id, tensor_backing});
     input_tensor_ids.push_back(tensor_id);
   }
 
@@ -79,7 +66,8 @@ CostDetails LocalCostEstimator::estimate_cost(
 
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
-                                     tensor_backing_map,
+                                     LayerTensorBackingMap{},
+                                     TensorBackingMap{},
                                      this->runtime_arg_config);
   local_backing.register_and_allocate_layer(layer_added_result.layer);
   local_backing.execute_init(layer_added_result.layer);

diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
@@ -68,7 +68,7 @@ std::optional<float>
 }
 
 void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
-  if (registry_contains_op_task(
+  if (registry_contains_task_for_layer(
           this->task_registry, operator_node, OpTaskType::INIT)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
@@ -85,7 +85,7 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
 
 std::optional<float>
     LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
-  if (registry_contains_op_task(
+  if (registry_contains_task_for_layer(
           this->task_registry, operator_node, OpTaskType::FWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
@@ -102,11 +102,10 @@ std::optional<float>
 void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
                                         reduced_tensor_t const &logit_tensor,
                                         reduced_tensor_t const &label_tensor) {
-  assert(
-      this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) &&
-      this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor));
+  assert(this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor));
   TaskInvocation loss_invocation =
       backward(loss_attrs, logit_tensor, label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
   // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
   TaskArgumentAccessor loss_accessor =
       this->get_task_arg_accessor(loss_invocation, std::nullopt);
@@ -116,7 +115,7 @@ void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
 
 std::optional<float>
     LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
-  if (registry_contains_op_task(
+  if (registry_contains_task_for_layer(
           this->task_registry, operator_node, OpTaskType::BWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
@@ -143,6 +142,8 @@ void LocalTrainingBacking::execute_update(
     // get invocation
     TaskInvocation invocation = get_update_invocation(
         optimizer_attrs, weight_tensor, optimizer_buffer_tensors);
+
+    // TODO: https://github.com/flexflow/flexflow-train/issues/1442
     // assert(is_invocation_valid(get_update_signature(attrs), invocation));
 
     // execute update

diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
@@ -71,7 +71,7 @@ void ModelTrainingInstance::execute_update() {
     this->training_backing.execute_update(node, this->optimizer_attrs);
   }
   this->optimizer_attrs =
-      get_next_iteration_optimizer_attrs(this->optimizer_attrs);
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
@@ -1,6 +1,7 @@
 #include "local-execution/task_binding.h"
 #include "utils/containers/contains_key.h"
 #include "utils/fmt/unordered_map.h"
+#include "utils/hash/unordered_map.h"
 
 namespace FlexFlow {
 
@@ -58,3 +59,17 @@ std::ostream &operator<<(std::ostream &s, TaskBinding const &x) {
 }
 
 } // namespace FlexFlow
+
+namespace std {
+
+// Out-of-line definition of the hash declared in task_binding.h: folds the
+// tensor bindings and then the arg bindings into a single seed.
+size_t hash<::FlexFlow::TaskBinding>::operator()(
+    ::FlexFlow::TaskBinding const &s) const {
+  size_t result = 0;
+  ::FlexFlow::hash_combine(result, s.get_tensor_bindings());
+  ::FlexFlow::hash_combine(result, s.get_arg_bindings());
+  return result;
+}
+
+} // namespace std
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
@@ -42,7 +42,7 @@ void register_tasks_for_layer(TaskRegistry &task_registry,
   }
 }
 
-bool registry_contains_op_task(TaskRegistry const &task_registry,
+bool registry_contains_task_for_layer(TaskRegistry const &task_registry,
                                layer_guid_t const &op,
                                OpTaskType const &op_task_type) {
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> task_ids;

diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc
@@ -18,7 +18,7 @@ void add_slot(TaskSignature &task_signature,
               TensorType tensor_type,
               SlotType slot_type) {
   TensorTypeSlotSpec tensor_guid_slot_spec =
-      TensorTypeSlotSpec{slot_type, tensor_type};
+      TensorTypeSlotSpec{name, tensor_type, slot_type};
   task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec});
 }
 

diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc
@@ -4,7 +4,7 @@
 namespace FlexFlow {
 
 reduced_tensor_t lower(tensor_guid_t const &tensor_guid) {
-  return reduced_tensor_t{tensor_guid.raw_graph_output.idx};
+  return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid};
 }
 
 std::vector<reduced_tensor_t>
Original file line number | Diff line change
@@ -4,8 +4,6 @@ features = [
       "eq",
       "ord",
       "hash",
-      "json",
-      "rapidcheck",
       "fmt",
     ]
@@ Expand Down @@