
Commit bd08f11

Upsample support NHWC (microsoft#10554)
Implement bilinear interpolation for Upsample (Resize) on 4-D input whose outermost and innermost scales (the innermost usually being the channel dimension of NHWC) are 1. In addition, revert HandleResize to the original implementation for the TransposeOptimizerTests.TestResize* tests.
1 parent e0d1d69 commit bd08f11
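
For context, the sketch below illustrates the operation this commit enables: bilinear upsampling of a 4-D NHWC tensor where only the two spatial scales differ from 1. It is a simplified, standalone example and not the ONNX Runtime kernel; it assumes a basic out_coord / scale ("asymmetric") coordinate mapping and omits ROI, extrapolation, and the other coordinate-transformation modes the real implementation supports. The function name is hypothetical.

// Standalone sketch (illustrative only): bilinear upsampling of an NHWC float
// tensor with scales [1.0, height_scale, width_scale, 1.0].
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> UpsampleBilinearNHWC(const std::vector<float>& X,
                                        int64_t N, int64_t H, int64_t W, int64_t C,
                                        float height_scale, float width_scale) {
  const int64_t out_H = static_cast<int64_t>(H * height_scale);
  const int64_t out_W = static_cast<int64_t>(W * width_scale);
  std::vector<float> Y(static_cast<size_t>(N * out_H * out_W * C));
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t oy = 0; oy < out_H; ++oy) {
      // Simplified "asymmetric" coordinate mapping: in = out / scale.
      const float in_y = std::min(oy / height_scale, static_cast<float>(H - 1));
      const int64_t y0 = static_cast<int64_t>(in_y);
      const int64_t y1 = std::min(y0 + 1, H - 1);
      const float dy = in_y - static_cast<float>(y0);
      for (int64_t ox = 0; ox < out_W; ++ox) {
        const float in_x = std::min(ox / width_scale, static_cast<float>(W - 1));
        const int64_t x0 = static_cast<int64_t>(in_x);
        const int64_t x1 = std::min(x0 + 1, W - 1);
        const float dx = in_x - static_cast<float>(x0);
        for (int64_t c = 0; c < C; ++c) {
          // NHWC indexing: ((n * H + y) * W + x) * C + c, so the channel stride is 1.
          auto at = [&](int64_t y, int64_t x) { return X[((n * H + y) * W + x) * C + c]; };
          const float top = at(y0, x0) * (1.0f - dx) + at(y0, x1) * dx;
          const float bottom = at(y1, x0) * (1.0f - dx) + at(y1, x1) * dx;
          Y[((n * out_H + oy) * out_W + ox) * C + c] = top * (1.0f - dy) + bottom * dy;
        }
      }
    }
  }
  return Y;
}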

File tree

4 files changed, +342 -275 lines changed

onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc

Lines changed: 29 additions & 37 deletions
@@ -967,41 +967,35 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con
   node.SetInput(i, gather_output);
 }
 
-// static bool HandleResize(HandlerArgs& args) {
-//   auto inputs = args.node.Inputs();
-//   int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
-//
-//   auto p = ChannelFirstToLastPerm(rank_int);
-//   auto& perm = p == args.perm ? args.perm : args.perm_inv;
-//   auto& perm_inv = p == args.perm ? args.perm_inv : args.perm;
-//
-//   if (args.ctx.opset < 11) {
-//     PermuteInput(args.ctx.graph, args.node, 1, perm);
-//   } else {
-//     if (inputs[1] != "") {
-//       std::vector<int64_t> double_perm_inv = perm;
-//       double_perm_inv.reserve(2 * args.perm.size());
-//       for (int64_t p1 : perm) {
-//         double_perm_inv.push_back(p1 + rank_int);
-//       }
-//       PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
-//     }
-//     for (size_t i = 2; i < inputs.size(); ++i) {
-//       if (inputs[i] != "") {
-//         PermuteInput(args.ctx.graph, args.node, i, perm);
-//       }
-//     }
-//   }
-//
-//   TransposeFirstInput(args.ctx, args.node, perm);
-//   TransposeOutputs(args.ctx, args.node, perm_inv);
-//
-//   SwapNodeOpTypeAndDomain(args.ctx.graph, args.node, args.node.OpType(), "com.microsoft.nhwc");
-//
-//   return true;
-// }
+static bool HandleResize(HandlerArgs& args) {
+  auto inputs = args.node.Inputs();
+  int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
+
+  if (args.ctx.opset < 11) {
+    PermuteInput(args.ctx.graph, args.node, 1, args.perm_inv);
+  } else {
+    if (inputs[1] != "") {
+      std::vector<int64_t> double_perm_inv = args.perm_inv;
+      double_perm_inv.reserve(2 * args.perm_inv.size());
+      for (int64_t p : args.perm_inv) {
+        double_perm_inv.push_back(p + rank_int);
+      }
+      PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
+    }
+    for (size_t i = 2; i < inputs.size(); ++i) {
+      if (inputs[i] != "") {
+        PermuteInput(args.ctx.graph, args.node, i, args.perm_inv);
+      }
+    }
+  }
+
+  TransposeFirstInput(args.ctx, args.node, args.perm_inv);
+  TransposeOutputs(args.ctx, args.node, args.perm);
+
+  return true;
+}
 
-// constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
+constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
 
 static bool HandlePad(HandlerArgs& args) {
   size_t rank = args.perm.size();
@@ -1640,9 +1634,7 @@ static const std::unordered_map<std::string_view, const HandlerInfo&> handler_ma
     {"Split", split_handler},
     {"Shape", shape_handler},
     {"Pad", pad_handler},
-    // Todo: renable resize handler after adding NHWC support in upsample op on cpu
-    // https://github.com/microsoft/onnxruntime/issues/9857
-    // {"Resize", resize_handler},
+    {"Resize", resize_handler},
     {"ReduceSum", reduce_sum_handler},
 
     {"ReduceLogSum", reduce_op_handler},

onnxruntime/core/providers/cpu/tensor/upsample.cc

Lines changed: 73 additions & 29 deletions
@@ -420,13 +420,15 @@ struct BilinearParams {
 // that amounts to 'Bilinear' Upsampling/Resizing in the sense that it assumes
 // the scale values for the outermost 2 dimensions are 1.
 // This is the common use-case where the 4-D input (batched multi-channel images)
-// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
-static BilinearParams SetupUpsampleBilinear(int64_t input_height,
-                                            int64_t input_width,
-                                            int64_t output_height,
-                                            int64_t output_width,
-                                            float height_scale,
-                                            float width_scale,
+// is usually of shapes:
+// - [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
+// - [N, H, W, C] and the scales are [1.0, height_scale, width_scale, 1.0]
+static BilinearParams SetupUpsampleBilinear(const int64_t input_height,
+                                            const int64_t input_width,
+                                            const int64_t output_height,
+                                            const int64_t output_width,
+                                            const float height_scale,
+                                            const float width_scale,
                                             const std::vector<float>& roi,
                                             AllocatorPtr& alloc,
                                             const GetOriginalCoordinateFunc& get_original_coordinate) {
@@ -523,26 +525,25 @@ static BilinearParams SetupUpsampleBilinear(int64_t input_height,
 }
 
 template <typename T>
-void UpsampleBilinear(int64_t batch_size,
-                      int64_t num_channels,
-                      int64_t input_height,
-                      int64_t input_width,
-                      int64_t output_height,
-                      int64_t output_width,
-                      float height_scale,
-                      float width_scale,
+void UpsampleBilinear(const int64_t batch_size,
+                      const int64_t num_channels,
+                      const int64_t input_height,
+                      const int64_t input_width,
+                      const int64_t output_height,
+                      const int64_t output_width,
+                      const float height_scale,
+                      const float width_scale,
                       const std::vector<float>& roi,
-                      bool use_extrapolation,
-                      float extrapolation_value,
-                      const T* XdataBase,
-                      T* YdataBase,
+                      const bool use_extrapolation,
+                      const float extrapolation_value,
+                      const T* const XdataBase,
+                      T* const YdataBase,
                       AllocatorPtr& alloc,
                       const GetOriginalCoordinateFunc& get_original_coordinate,
                       concurrency::ThreadPool* tp) {
   BilinearParams p = SetupUpsampleBilinear(input_height, input_width, output_height, output_width,
                                            height_scale, width_scale, roi,
                                            alloc, get_original_coordinate);
-
   for (int64_t n = 0; n < batch_size; ++n) {
     concurrency::ThreadPool::TrySimpleParallelFor(
         tp, num_channels,
@@ -1065,22 +1066,65 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
     case UpsampleMode::LINEAR: {
       // Supports 'bilinear' and 'trilinear' sampling only
 
-      //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1
+      //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 or
+      // 4-D input with outermost and innermost scales as 1
       if (dims.size() == 2 || dims.size() == 4) {
         bool is_2D = dims.size() == 2;
 
-        const int64_t batch_size = is_2D ? 1 : dims[0];
-        const int64_t num_channels = is_2D ? 1 : dims[1];
-        const int64_t input_height = is_2D ? dims[0] : dims[2];
-        const int64_t input_width = is_2D ? dims[1] : dims[3];
-
-        const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
-        const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
+        int64_t batch_size;
+        int64_t num_channels;
+        int64_t input_height;
+        int64_t input_width;
+
+        int64_t output_height;
+        int64_t output_width;
+
+        float height_scale;
+        float width_scale;
+
+        if (is_2D) {
+          batch_size = 1;
+          num_channels = 1;
+          input_height = dims[0];
+          input_width = dims[1];
+
+          output_height = output_dims[0];
+          output_width = output_dims[1];
+
+          height_scale = scales[0];
+          width_scale = scales[1];
+        } else {
+          if (scales[1] == 1.0f) {
+            batch_size = dims[0];
+            num_channels = dims[1];
+            input_height = dims[2];
+            input_width = dims[3];
+
+            output_height = output_dims[2];
+            output_width = output_dims[3];
+
+            height_scale = scales[2];
+            width_scale = scales[3];
+          } else {
+            ORT_ENFORCE(scales[3] == 1.0f, "4-D input with innermost scale (usually channel of NHWC) as 1.");
+
+            batch_size = dims[0];
+            num_channels = dims[3];
+            input_height = dims[1];
+            input_width = dims[2];
+
+            output_height = output_dims[1];
+            output_width = output_dims[2];
+
+            height_scale = scales[1];
+            width_scale = scales[2];
+          }
+        }
 
         AllocatorPtr alloc;
         ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
         UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
-                         is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
+                         height_scale, width_scale, roi,
                          use_extrapolation_, extrapolation_value_, X->Data<T>(),
                          Y->MutableData<T>(), alloc, get_original_coordinate_,
                          output_height * output_width > 64 ? context->GetOperatorThreadPool() : nullptr);
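
As a usage note for the dispatch in BaseCompute above: for a 4-D input, the layout is inferred from the scales. A second scale of 1 means NCHW (scales [1, 1, h, w]); otherwise the last scale must be 1 and the input is treated as NHWC (scales [1, h, w, 1]). The sketch below restates that rule in isolation; the helper name and enum are hypothetical and not part of ONNX Runtime.

#include <array>
#include <cassert>

enum class Layout { NCHW, NHWC };

// Hypothetical helper mirroring the scale-based layout check shown in the diff above.
inline Layout SelectBilinearLayout(const std::array<float, 4>& scales) {
  if (scales[1] == 1.0f) {
    return Layout::NCHW;  // scales look like [1, 1, height_scale, width_scale]
  }
  assert(scales[3] == 1.0f && "innermost (channel) scale must be 1 for NHWC");
  return Layout::NHWC;    // scales look like [1, height_scale, width_scale, 1]
}

// Example: SelectBilinearLayout({1.0f, 1.0f, 2.0f, 2.0f}) == Layout::NCHW
//          SelectBilinearLayout({1.0f, 2.0f, 2.0f, 1.0f}) == Layout::NHWC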
