[hannk] Add a prepare() method for ops and interp #6338

Merged 5 commits on Oct 26, 2021
4 changes: 4 additions & 0 deletions apps/hannk/delegate/hannk_delegate.cpp
@@ -388,6 +388,10 @@ class HannkDelegateKernel final {
InterpreterOptions options;
options.verbosity = options_.verbosity;
interpreter_ = std::unique_ptr<Interpreter>(new Interpreter(std::move(model_), std::move(options)));
if (!interpreter_->prepare()) {
TF_LITE_KERNEL_LOG(context, "hannk::Interpreter::prepare() failed");
return kTfLiteDelegateError;
}

for (int tensor_id : TfLiteIntArrayView(node->outputs)) {
if (tensor_id == kTfLiteOptionalTensor) {
40 changes: 34 additions & 6 deletions apps/hannk/interpreter/interpreter.cpp
@@ -14,8 +14,7 @@ extern "C" int halide_malloc_alignment();
namespace hannk {

Interpreter::Interpreter(std::unique_ptr<OpGroup> m, InterpreterOptions options)
-    : model_(std::move(m)) {
-    init(options);
+    : model_(std::move(m)), options_(std::move(options)) {
}

Interpreter::~Interpreter() {
@@ -25,6 +24,8 @@ namespace {

// TODO: maybe move this to a separate file? Not sure if it's complex enough to be worth it.
class TensorVisitor : public OpVisitor {
using OpVisitor::visit;

virtual void visit_tensor(const TensorPtr &t) = 0;

void visit(OpGroup *g) override {
@@ -162,33 +163,58 @@ class VerifyAllAllocated : public TensorVisitor {

} // namespace

-void Interpreter::init(InterpreterOptions options) {
-    pad_for_ops(model_.get());
+bool Interpreter::prepare() {
if (prepared_) {
HLOG(ERROR) << "Do not call prepare() twice";
return false;
}

// We must prepare the model before doing the transforms, as some of the
// transforms may rely on information cached by prepare(), e.g. alignment requirements.
// (Note that any transforms that add new ops are expected to call prepare() on them,
// returning errors as appropriate.)
if (!model_->prepare()) {
HLOG(ERROR) << "model_->prepare() failed.";
return false;
}

if (!pad_for_ops(model_.get())) {
HLOG(ERROR) << "pad_for_ops() failed.";
return false;
}
in_place(model_.get());
fold_constants(model_.get());
remove_dead_ops(model_.get());

assert(tensor_storage_arena_ == nullptr);
-    tensor_storage_arena_ = allocate_tensors(model_.get(), options);
+    tensor_storage_arena_ = allocate_tensors(model_.get(), options_);

#ifndef NDEBUG
VerifyAllAllocated verify_all;
model_->accept(&verify_all);
#endif

-    if (options.verbosity >= 2) {
+    if (options_.verbosity >= 2) {
std::ostringstream os;
os << "Model after transformations:\n";
model_->dump(os);
HLOG(INFO) << os.str();
}

prepared_ = true;
return true;
}

void Interpreter::execute() {
if (!prepared_) {
HLOG(ERROR) << "Must call prepare() before execute()";
return;
}
model_->execute();
}

TensorPtr Interpreter::get_tensor(const std::string &name) {
HCHECK(prepared_);
for (int i = 0; i < model_->op_count(); i++) {
Op *op = model_->op(i);
for (int j = 0; j < op->input_count(); j++) {
@@ -206,6 +232,7 @@ TensorPtr Interpreter::get_tensor(const std::string &name) {
}

std::vector<TensorPtr> Interpreter::inputs() {
HCHECK(prepared_);
std::vector<TensorPtr> result;
for (int i = 0; i < model_->input_count(); i++) {
result.push_back(model_->input(i));
@@ -215,6 +242,7 @@ std::vector<TensorPtr> Interpreter::inputs() {
}

std::vector<TensorPtr> Interpreter::outputs() {
HCHECK(prepared_);
std::vector<TensorPtr> result;
for (int i = 0; i < model_->output_count(); i++) {
result.push_back(model_->output(i));
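The ordering comment in prepare() above has a corollary for the transform passes: because model_->prepare() runs before pad_for_ops() and the other transforms, any transform that synthesizes a new op must call prepare() on that op itself. A minimal sketch of the expected shape of such a pass — the factory and insertion calls here are hypothetical, for illustration only:

// Hypothetical transform sketch: a pass that synthesizes a new op must
// prepare it itself, since Interpreter::prepare() has already prepared
// every op that existed before the transforms ran.
bool insert_padding_op(OpGroup *group, int index) {
    OpPtr pad = make_pad_op(/*...*/);  // hypothetical factory
    if (!pad->prepare()) {
        HLOG(ERROR) << "prepare() failed for a synthesized pad op.";
        return false;  // propagate the failure, as pad_for_ops() now does
    }
    group->insert(std::move(pad), index);  // hypothetical insertion API
    return true;
}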
14 changes: 12 additions & 2 deletions apps/hannk/interpreter/interpreter.h
@@ -19,8 +19,8 @@ struct InterpreterOptions {
class Interpreter {
std::unique_ptr<OpGroup> model_;
std::unique_ptr<char[]> tensor_storage_arena_;

-    void init(InterpreterOptions options);
+    InterpreterOptions options_;
+    bool prepared_ = false;

public:
explicit Interpreter(std::unique_ptr<OpGroup> m, InterpreterOptions options = InterpreterOptions());
@@ -30,6 +30,16 @@ class Interpreter {
// If none with that name, return null. Tensor is still owned by the Model.
TensorPtr get_tensor(const std::string &name);

// Must call prepare() exactly once, before any calls to execute().
// This performs various transformations on the ops, and gives each op
// a chance to prepare for execution; this is a good time for an op to
// compute and cache anything that might be used repeatedly if execute()
// is called multiple times. (Note that an op may have prepare() called
// on it, but then later get discarded by a transform.)
//
// Returns false if an error occurs, in which case execute() should not be called.
[[nodiscard]] bool prepare();

void execute();

// Return the Tensor(s) that are the initial input(s) of the Model.
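Taken together, the comments above pin down the intended lifecycle: construct, prepare() exactly once, then execute() as often as needed. A minimal caller-side sketch, assuming a model already built as a std::unique_ptr<OpGroup> named model:

// Sketch of the prepare-then-execute lifecycle from a caller's perspective.
hannk::InterpreterOptions options;
options.verbosity = 0;
hannk::Interpreter interpreter(std::move(model), options);

if (!interpreter.prepare()) {
    // Per the contract above, execute() must not be called after a
    // failed prepare().
    return;
}

// After a successful prepare(), execute() may be called repeatedly,
// typically after refreshing the input tensors between runs.
interpreter.execute();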
9 changes: 9 additions & 0 deletions apps/hannk/interpreter/model.cpp
@@ -91,6 +91,15 @@ void Op::dump(std::ostream &os, int indent) const {
os << "\n";
}

bool OpGroup::prepare() {
for (int i = 0; i < op_count(); i++) {
if (!op(i)->prepare()) {
return false;
}
}
return true;
}

void OpGroup::execute() {
for (int i = 0; i < op_count(); i++) {
op(i)->execute();
9 changes: 9 additions & 0 deletions apps/hannk/interpreter/model.h
@@ -278,6 +278,13 @@ class Op {
}
}

// Prepare the op for future execution. The Op can assume that the types and dimensions
// of all its input/output Tensors will remain the same after this.
// Return false on error.
virtual bool prepare() {
return true;
}

// Execute the op on a given crop.
virtual void execute() = 0;

@@ -347,6 +354,8 @@ class OpGroup : public Op {

BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

int op_count() const {
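The default implementation above returns true, so existing ops need no changes; an op overrides prepare() only when it has something worth computing once and caching. A sketch of the override pattern — the op is hypothetical, and the other required Op overrides are omitted for brevity:

// Sketch: an op that caches a derived value once in prepare() rather
// than recomputing it on every execute().
class MyOp : public Op {
    int cached_alignment_ = 0;  // computed once in prepare()

public:
    bool prepare() override {
        // The types and dimensions of the input/output Tensors are
        // stable from here on, so values derived from them can be cached.
        cached_alignment_ = query_required_alignment();  // hypothetical helper
        return cached_alignment_ > 0;  // false reports an error upward
    }

    void execute() override {
        // ... uses cached_alignment_ instead of recomputing it ...
    }
};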
73 changes: 44 additions & 29 deletions apps/hannk/interpreter/ops.cpp
@@ -748,6 +748,9 @@ halide_type_t ConvOp::filter_type() const {
}

BoundsMap ConvOp::map_bounds(int input_idx, int output_idx) const {
assert(vector_reduction_ > 0);
assert(vector_tile_ > 0);

#ifdef CONV_R16
const int unroll_reduction = filter()->extent(0) >= 16 ? 16 : 4;
#else
@@ -763,22 +766,13 @@ BoundsMap ConvOp::map_bounds(int input_idx, int output_idx) const {
}
return result;
} else if (input_idx == 1) {
-// Pass minimal sized buffers to learn about the alignment requirements.
-HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
-HalideBuffer<int32_t> bias_buf(nullptr, 1);
-HalideBuffer<void> filter_buf(filter_type(), nullptr, 1, 1, 1, 1, 1, 1);
-HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
-conv_u8_u8_u8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, output_buf);
-
-const int vector_reduction = filter_buf.dim(0).extent();
-const int vector_tile = filter_buf.dim(1).extent();
-const int channel_alignment = unroll_reduction / vector_reduction;
+const int channel_alignment = unroll_reduction / vector_reduction_;
BoundsMap result(input()->rank() + 2, output()->rank());
result
-.constant(0, vector_reduction)
-.constant(1, vector_tile)
-.constant(2, align_up(ceil_div(filter()->extent(0), vector_reduction), channel_alignment))
-.upsample(3, 0, vector_tile);
+.constant(0, vector_reduction_)
+.constant(1, vector_tile_)
+.constant(2, align_up(ceil_div(filter()->extent(0), vector_reduction_), channel_alignment))
+.upsample(3, 0, vector_tile_);
for (int i = 1; i < output()->rank() - 1; i++) {
result.constant(i + 3, filter()->bounds(i));
}
@@ -817,6 +811,22 @@ void call_conv2d(halide_buffer_t *input, halide_buffer_t *filter, halide_buffer_

} // namespace

bool ConvOp::prepare() {
// Pass minimal sized buffers to learn about the alignment requirements.
// TODO: need to adapt this to the types of in, filt, out once we support multiple variants
HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
HalideBuffer<int32_t> bias_buf(nullptr, 1);
HalideBuffer<void> filter_buf(filter_type(), nullptr, 1, 1, 1, 1, 1, 1);
HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
if (conv_u8_u8_u8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, output_buf) != 0) {
return false;
}

vector_reduction_ = filter_buf.dim(0).extent();
vector_tile_ = filter_buf.dim(1).extent();
return true;
}

void ConvOp::execute() {
const TensorPtr &in = input();
const TensorPtr &filt = filter();
@@ -896,18 +906,8 @@ void call_depthwise_conv_uint8(
}
}

-// TODO: this could probably be cached rather than recalculated each time
-int get_depthwise_conv_channel_alignment() {
-// Pass minimal sized buffers to learn about the alignment requirements.
-HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
-HalideBuffer<int32_t> bias_buf(nullptr, 1);
-HalideBuffer<uint8_t> filter_buf(nullptr, 1, 1, 1);
-HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
-depthwise_conv_uint8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, output_buf);
-return input_buf.dim(0).extent();
-}

bool can_be_shallow(int alignment, int extent_0, int extent_1) {
assert(alignment > 0);
// This is correct: we want to use shallow when the vector size (ie, alignment)
// is evenly divisible by the number of channels (ie, extent(0)).
//
@@ -920,6 +920,8 @@ bool can_be_shallow(int alignment, int extent_0, int extent_1) {

BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
assert(output_idx == 0);
assert(channel_alignment_ > 0);

if (input_idx == 0) {
BoundsMap result(4, 4);
result
Expand All @@ -928,12 +930,11 @@ BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
.downsample(2, 2, stride_[1], Interval(0, dilation_[1] * (filter()->extent(2) - 1)))
.elementwise(3, 3);
if (depth_multiplier_ == 1) {
-const int alignment = get_depthwise_conv_channel_alignment();
if (stride_[0] == 1 &&
-    can_be_shallow(alignment, input()->extent(0), input()->extent(1))) {
+    can_be_shallow(channel_alignment_, input()->extent(0), input()->extent(1))) {
// We can use the shallow version of depthwise here.
} else {
-    result.align_input(0, alignment);
+    result.align_input(0, channel_alignment_);
}
}
return result;
@@ -949,6 +950,20 @@ BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
}
}

bool DepthwiseConv2DOp::prepare() {
// Pass minimal sized buffers to learn about the alignment requirements.
// TODO: need to adapt this to the types of in, filt, out once we support multiple variants
HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
HalideBuffer<int32_t> bias_buf(nullptr, 1);
HalideBuffer<uint8_t> filter_buf(nullptr, 1, 1, 1);
HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
if (depthwise_conv_uint8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, output_buf) != 0) {
return false;
}
channel_alignment_ = input_buf.dim(0).extent();
return true;
}

void DepthwiseConv2DOp::execute() {
const TensorPtr &in = input();
const TensorPtr &filt = filter();
Expand All @@ -975,7 +990,7 @@ void DepthwiseConv2DOp::execute() {
if (stride_[0] == 1 &&
can_fuse_cx(FuseType::InPlace, input_buf) &&
can_fuse_cx(FuseType::InPlace, output_buf) &&
-    can_be_shallow(get_depthwise_conv_channel_alignment(), input_buf.dim(0).extent(), input_buf.dim(1).extent())) {
+    can_be_shallow(channel_alignment_, input_buf.dim(0).extent(), input_buf.dim(1).extent())) {
input_stride_x = input_buf.dim(1).stride();
fuse_cx(FuseType::InPlace, input_buf);
fuse_cx(FuseType::InPlace, output_buf);
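Both prepare() overrides above use the same Halide idiom: calling an AOT-compiled pipeline with buffers whose host pointers are null performs a bounds query — nothing is computed, and the pipeline writes the mins, extents, and strides it requires back into the buffers. A standalone sketch of the idiom, with a hypothetical generated pipeline standing in for conv_u8_u8_u8 / depthwise_conv_uint8:

#include "HalideBuffer.h"
#include "my_pipeline.h"  // hypothetical AOT-generated Halide pipeline

// Returns the extent the pipeline requires of the input's dimension 0
// (its alignment requirement), or -1 on error. A null host pointer makes
// the call a bounds query: no computation happens, and Halide fills in
// the shape each buffer would need.
int query_input_alignment() {
    Halide::Runtime::Buffer<uint8_t> in(nullptr, 1, 1);
    Halide::Runtime::Buffer<uint8_t> out(nullptr, 1, 1);
    if (my_pipeline(in, out) != 0) {
        return -1;
    }
    return in.dim(0).extent();  // inferred requirement, cf. ConvOp::prepare()
}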
11 changes: 11 additions & 0 deletions apps/hannk/interpreter/ops.h
@@ -97,6 +97,10 @@ class ConvOp : public Op {
Padding padding_;
ActivationFunction activation_;

// calculated in prepare()
int vector_reduction_ = 0;
int vector_tile_ = 0;

public:
ConvOp(const TensorPtr &input, const TensorPtr &filter, const TensorPtr &bias, const TensorPtr &output,
std::array<int, 2> stride, std::array<int, 2> dilation, Padding padding,
@@ -129,6 +133,8 @@
halide_type_t filter_type() const;
BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

std::string name() const override {
Expand All @@ -143,6 +149,9 @@ class DepthwiseConv2DOp : public Op {
Padding padding_;
ActivationFunction activation_;

// calculated in prepare()
int channel_alignment_ = 0;

public:
DepthwiseConv2DOp(const TensorPtr &input, const TensorPtr &filter, const TensorPtr &bias, const TensorPtr &output,
int depth_multiplier, std::array<int, 2> stride, std::array<int, 2> dilation,
@@ -179,6 +188,8 @@

BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

std::string name() const override {