[hannk] Add a prepare() method for ops and interp #6338

Merged 5 commits on Oct 26, 2021
4 changes: 4 additions & 0 deletions apps/hannk/delegate/hannk_delegate.cpp
@@ -388,6 +388,10 @@ class HannkDelegateKernel final {
InterpreterOptions options;
options.verbosity = options_.verbosity;
interpreter_ = std::unique_ptr<Interpreter>(new Interpreter(std::move(model_), std::move(options)));
if (!interpreter_->prepare()) {
TF_LITE_KERNEL_LOG(context, "hannk::Interpreter::prepare() failed");
return kTfLiteDelegateError;
}

for (int tensor_id : TfLiteIntArrayView(node->outputs)) {
if (tensor_id == kTfLiteOptionalTensor) {
40 changes: 34 additions & 6 deletions apps/hannk/interpreter/interpreter.cpp
@@ -14,8 +14,7 @@ extern "C" int halide_malloc_alignment();
namespace hannk {

Interpreter::Interpreter(std::unique_ptr<OpGroup> m, InterpreterOptions options)
-    : model_(std::move(m)) {
-    init(options);
+    : model_(std::move(m)), options_(std::move(options)) {
}

Interpreter::~Interpreter() {
@@ -25,6 +24,8 @@ namespace {

// TODO: maybe move this to a separate file? Not sure if it's complex enough to be worth it.
class TensorVisitor : public OpVisitor {
using OpVisitor::visit;

virtual void visit_tensor(const TensorPtr &t) = 0;

void visit(OpGroup *g) override {
@@ -162,33 +163,58 @@ class VerifyAllAllocated : public TensorVisitor {

} // namespace

-void Interpreter::init(InterpreterOptions options) {
-    pad_for_ops(model_.get());
+bool Interpreter::prepare() {
if (prepared_) {
HLOG(ERROR) << "Do not call prepare() twice";
return false;
}

// We must prepare the model before doing the transforms, as some of the
// transforms may rely on information cached by prepare(), e.g. alignment requirements.
// (Note that any transforms that add new ops are expected to call prepare() on them,
// returning errors as appropriate.)
if (!model_->prepare()) {
HLOG(ERROR) << "model_->prepare() failed.";
return false;
}

if (!pad_for_ops(model_.get())) {
HLOG(ERROR) << "pad_for_ops() failed.";
return false;
}
in_place(model_.get());
fold_constants(model_.get());
remove_dead_ops(model_.get());

assert(tensor_storage_arena_ == nullptr);
-    tensor_storage_arena_ = allocate_tensors(model_.get(), options);
+    tensor_storage_arena_ = allocate_tensors(model_.get(), options_);

#ifndef NDEBUG
VerifyAllAllocated verify_all;
model_->accept(&verify_all);
#endif

-    if (options.verbosity >= 2) {
+    if (options_.verbosity >= 2) {
std::ostringstream os;
os << "Model after transformations:\n";
model_->dump(os);
HLOG(INFO) << os.str();
}

prepared_ = true;
return true;
}

void Interpreter::execute() {
if (!prepared_) {
HLOG(ERROR) << "Must call prepare() before execute()";
return;
}
model_->execute();
}

TensorPtr Interpreter::get_tensor(const std::string &name) {
HCHECK(prepared_);
for (int i = 0; i < model_->op_count(); i++) {
Op *op = model_->op(i);
for (int j = 0; j < op->input_count(); j++) {
@@ -206,6 +232,7 @@ TensorPtr Interpreter::get_tensor(const std::string &name) {
}

std::vector<TensorPtr> Interpreter::inputs() {
HCHECK(prepared_);
std::vector<TensorPtr> result;
for (int i = 0; i < model_->input_count(); i++) {
result.push_back(model_->input(i));
@@ -215,6 +242,7 @@ std::vector<TensorPtr> Interpreter::inputs() {
}

std::vector<TensorPtr> Interpreter::outputs() {
HCHECK(prepared_);
std::vector<TensorPtr> result;
for (int i = 0; i < model_->output_count(); i++) {
result.push_back(model_->output(i));
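The ordering comment in prepare() above has a corollary for the transform passes: because model_->prepare() runs before pad_for_ops() and the other transforms, any transform that synthesizes a new op must call prepare() on that op itself. A minimal sketch of the expected shape of such a pass — the factory and insertion calls here are hypothetical, for illustration only:

// Hypothetical transform sketch: a pass that synthesizes a new op must
// prepare it itself, since Interpreter::prepare() has already prepared
// every op that existed before the transforms ran.
bool insert_padding_op(OpGroup *group, int index) {
    OpPtr pad = make_pad_op(/*...*/);  // hypothetical factory
    if (!pad->prepare()) {
        HLOG(ERROR) << "prepare() failed for a synthesized pad op.";
        return false;  // propagate the failure, as pad_for_ops() now does
    }
    group->insert(std::move(pad), index);  // hypothetical insertion API
    return true;
}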
14 changes: 12 additions & 2 deletions apps/hannk/interpreter/interpreter.h
@@ -19,8 +19,8 @@ struct InterpreterOptions {
class Interpreter {
std::unique_ptr<OpGroup> model_;
std::unique_ptr<char[]> tensor_storage_arena_;

-    void init(InterpreterOptions options);
+    InterpreterOptions options_;
+    bool prepared_ = false;

public:
explicit Interpreter(std::unique_ptr<OpGroup> m, InterpreterOptions options = InterpreterOptions());
@@ -30,6 +30,16 @@ class Interpreter {
// If none with that name, return null. Tensor is still owned by the Model.
TensorPtr get_tensor(const std::string &name);

// Must call prepare() exactly once, before any calls to execute().
// This performs various transformations on the ops, and gives each op
// a chance to prepare for execution; this is a good time for an op to
// compute and cache anything that might be used repeatedly if execute()
// is called multiple times. (Note that an op may have prepare() called
// on it, but then later get discarded by a transform.)
//
// Returns false if an error occurs, in which case execute() should not be called.
[[nodiscard]] bool prepare();

void execute();

// Return the Tensor(s) that are the initial input(s) of the Model.
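Taken together, the comments above pin down the intended lifecycle: construct, prepare() exactly once, then execute() as often as needed. A minimal caller-side sketch, assuming a model already built as a std::unique_ptr<OpGroup> named model:

// Sketch of the prepare-then-execute lifecycle from a caller's perspective.
hannk::InterpreterOptions options;
options.verbosity = 0;
hannk::Interpreter interpreter(std::move(model), options);

if (!interpreter.prepare()) {
    // Per the contract above, execute() must not be called after a
    // failed prepare().
    return;
}

// After a successful prepare(), execute() may be called repeatedly,
// typically after refreshing the input tensors between runs.
interpreter.execute();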
9 changes: 9 additions & 0 deletions apps/hannk/interpreter/model.cpp
@@ -91,6 +91,15 @@ void Op::dump(std::ostream &os, int indent) const {
os << "\n";
}

bool OpGroup::prepare() {
for (int i = 0; i < op_count(); i++) {
if (!op(i)->prepare()) {
return false;
}
}
return true;
}

void OpGroup::execute() {
for (int i = 0; i < op_count(); i++) {
op(i)->execute();
9 changes: 9 additions & 0 deletions apps/hannk/interpreter/model.h
@@ -278,6 +278,13 @@ class Op {
}
}

// Prepare the op for future execution. The Op can assume that the types and dimensions
// of all its input/output Tensors will remain the same after this.
// Return false on error.
virtual bool prepare() {
return true;
}

// Execute the op on a given crop.
virtual void execute() = 0;

@@ -347,6 +354,8 @@ class OpGroup : public Op {

BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

int op_count() const {
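The default implementation above returns true, so existing ops need no changes; an op overrides prepare() only when it has something worth computing once and caching. A sketch of the override pattern — the op is hypothetical, and the other required Op overrides are omitted for brevity:

// Sketch: an op that caches a derived value once in prepare() rather
// than recomputing it on every execute().
class MyOp : public Op {
    int cached_alignment_ = 0;  // computed once in prepare()

public:
    bool prepare() override {
        // The types and dimensions of the input/output Tensors are
        // stable from here on, so values derived from them can be cached.
        cached_alignment_ = query_required_alignment();  // hypothetical helper
        return cached_alignment_ > 0;  // false reports an error upward
    }

    void execute() override {
        // ... uses cached_alignment_ instead of recomputing it ...
    }
};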
73 changes: 44 additions & 29 deletions apps/hannk/interpreter/ops.cpp
@@ -748,6 +748,9 @@ halide_type_t ConvOp::filter_type() const {
}

BoundsMap ConvOp::map_bounds(int input_idx, int output_idx) const {
assert(vector_reduction_ > 0);
assert(vector_tile_ > 0);

#ifdef CONV_R16
const int unroll_reduction = filter()->extent(0) >= 16 ? 16 : 4;
#else
@@ -763,22 +766,13 @@ BoundsMap ConvOp::map_bounds(int input_idx, int output_idx) const {
}
return result;
} else if (input_idx == 1) {
-// Pass minimal sized buffers to learn about the alignment requirements.
-HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
-HalideBuffer<int32_t> bias_buf(nullptr, 1);
-HalideBuffer<void> filter_buf(filter_type(), nullptr, 1, 1, 1, 1, 1, 1);
-HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
-conv_u8_u8_u8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, output_buf);
-
-const int vector_reduction = filter_buf.dim(0).extent();
-const int vector_tile = filter_buf.dim(1).extent();
-const int channel_alignment = unroll_reduction / vector_reduction;
+const int channel_alignment = unroll_reduction / vector_reduction_;
BoundsMap result(input()->rank() + 2, output()->rank());
result
-.constant(0, vector_reduction)
-.constant(1, vector_tile)
-.constant(2, align_up(ceil_div(filter()->extent(0), vector_reduction), channel_alignment))
-.upsample(3, 0, vector_tile);
+.constant(0, vector_reduction_)
+.constant(1, vector_tile_)
+.constant(2, align_up(ceil_div(filter()->extent(0), vector_reduction_), channel_alignment))
+.upsample(3, 0, vector_tile_);
for (int i = 1; i < output()->rank() - 1; i++) {
result.constant(i + 3, filter()->bounds(i));
}
@@ -817,6 +811,22 @@ void call_conv2d(halide_buffer_t *input, halide_buffer_t *filter, halide_buffer_

} // namespace

bool ConvOp::prepare() {
// Pass minimal sized buffers to learn about the alignment requirements.
// TODO: need to adapt this to the types of in, filt, out once we support multiple variants
HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
HalideBuffer<int32_t> bias_buf(nullptr, 1);
HalideBuffer<void> filter_buf(filter_type(), nullptr, 1, 1, 1, 1, 1, 1);
HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
if (conv_u8_u8_u8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, output_buf) != 0) {
return false;
}

vector_reduction_ = filter_buf.dim(0).extent();
vector_tile_ = filter_buf.dim(1).extent();
return true;
}

void ConvOp::execute() {
const TensorPtr &in = input();
const TensorPtr &filt = filter();
@@ -896,18 +906,8 @@ void call_depthwise_conv_uint8(
}
}

-// TODO: this could probably be cached rather than recalculated each time
-int get_depthwise_conv_channel_alignment() {
-// Pass minimal sized buffers to learn about the alignment requirements.
-HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
-HalideBuffer<int32_t> bias_buf(nullptr, 1);
-HalideBuffer<uint8_t> filter_buf(nullptr, 1, 1, 1);
-HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
-depthwise_conv_uint8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, output_buf);
-return input_buf.dim(0).extent();
-}

bool can_be_shallow(int alignment, int extent_0, int extent_1) {
assert(alignment > 0);
// This is correct: we want to use shallow when the vector size (ie, alignment)
// is evenly divisible by the number of channels (ie, extent(0)).
//
@@ -920,6 +920,8 @@ bool can_be_shallow(int alignment, int extent_0, int extent_1) {

BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
assert(output_idx == 0);
assert(channel_alignment_ > 0);

if (input_idx == 0) {
BoundsMap result(4, 4);
result
Expand All @@ -928,12 +930,11 @@ BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
.downsample(2, 2, stride_[1], Interval(0, dilation_[1] * (filter()->extent(2) - 1)))
.elementwise(3, 3);
if (depth_multiplier_ == 1) {
-const int alignment = get_depthwise_conv_channel_alignment();
if (stride_[0] == 1 &&
-    can_be_shallow(alignment, input()->extent(0), input()->extent(1))) {
+    can_be_shallow(channel_alignment_, input()->extent(0), input()->extent(1))) {
// We can use the shallow version of depthwise here.
} else {
-    result.align_input(0, alignment);
+    result.align_input(0, channel_alignment_);
}
}
return result;
@@ -949,6 +950,20 @@ BoundsMap DepthwiseConv2DOp::map_bounds(int input_idx, int output_idx) const {
}
}

bool DepthwiseConv2DOp::prepare() {
// Pass minimal sized buffers to learn about the alignment requirements.
// TODO: need to adapt this to the types of in, filt, out once we support multiple variants
HalideBuffer<uint8_t> input_buf(nullptr, 1, 1, 1, 1);
HalideBuffer<int32_t> bias_buf(nullptr, 1);
HalideBuffer<uint8_t> filter_buf(nullptr, 1, 1, 1);
HalideBuffer<uint8_t> output_buf(nullptr, 1, 1, 1, 1);
if (depthwise_conv_uint8(input_buf, 0, filter_buf, 0, bias_buf, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, output_buf) != 0) {
return false;
}
channel_alignment_ = input_buf.dim(0).extent();
return true;
}

void DepthwiseConv2DOp::execute() {
const TensorPtr &in = input();
const TensorPtr &filt = filter();
Expand All @@ -975,7 +990,7 @@ void DepthwiseConv2DOp::execute() {
if (stride_[0] == 1 &&
can_fuse_cx(FuseType::InPlace, input_buf) &&
can_fuse_cx(FuseType::InPlace, output_buf) &&
-    can_be_shallow(get_depthwise_conv_channel_alignment(), input_buf.dim(0).extent(), input_buf.dim(1).extent())) {
+    can_be_shallow(channel_alignment_, input_buf.dim(0).extent(), input_buf.dim(1).extent())) {
input_stride_x = input_buf.dim(1).stride();
fuse_cx(FuseType::InPlace, input_buf);
fuse_cx(FuseType::InPlace, output_buf);
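Both prepare() overrides above use the same Halide idiom: calling an AOT-compiled pipeline with buffers whose host pointers are null performs a bounds query — nothing is computed, and the pipeline writes the mins, extents, and strides it requires back into the buffers. A standalone sketch of the idiom, with a hypothetical generated pipeline standing in for conv_u8_u8_u8 / depthwise_conv_uint8:

#include "HalideBuffer.h"
#include "my_pipeline.h"  // hypothetical AOT-generated Halide pipeline

// Returns the extent the pipeline requires of the input's dimension 0
// (its alignment requirement), or -1 on error. A null host pointer makes
// the call a bounds query: no computation happens, and Halide fills in
// the shape each buffer would need.
int query_input_alignment() {
    Halide::Runtime::Buffer<uint8_t> in(nullptr, 1, 1);
    Halide::Runtime::Buffer<uint8_t> out(nullptr, 1, 1);
    if (my_pipeline(in, out) != 0) {
        return -1;
    }
    return in.dim(0).extent();  // inferred requirement, cf. ConvOp::prepare()
}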
11 changes: 11 additions & 0 deletions apps/hannk/interpreter/ops.h
@@ -97,6 +97,10 @@ class ConvOp : public Op {
Padding padding_;
ActivationFunction activation_;

// calculated in prepare()
int vector_reduction_ = 0;
int vector_tile_ = 0;

public:
ConvOp(const TensorPtr &input, const TensorPtr &filter, const TensorPtr &bias, const TensorPtr &output,
std::array<int, 2> stride, std::array<int, 2> dilation, Padding padding,
@@ -129,6 +133,8 @@
halide_type_t filter_type() const;
BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

std::string name() const override {
Expand All @@ -143,6 +149,9 @@ class DepthwiseConv2DOp : public Op {
Padding padding_;
ActivationFunction activation_;

// calculated in prepare()
int channel_alignment_ = 0;

public:
DepthwiseConv2DOp(const TensorPtr &input, const TensorPtr &filter, const TensorPtr &bias, const TensorPtr &output,
int depth_multiplier, std::array<int, 2> stride, std::array<int, 2> dilation,
@@ -179,6 +188,8 @@

BoundsMap map_bounds(int input_idx, int output_idx) const override;

bool prepare() override;

void execute() override;

std::string name() const override {