Merged PR 6973543: ORT DML EP Opset 13 more complete
Extend opset 13 support for:
- Split-13
- Squeeze-13
- Unsqueeze-13
- Reshape-13
- QuantizeLinear-13
- DequantizeLinear-13
- ReduceSum-13
- Resize-13

Also:
- Rename the file where all the opset versions are stored from "OperatorRegistration.h" to "OperatorVersions.h". This avoids future confusion, since another file named "OperatorRegistration.h" already exists and corresponds to "OperatorRegistration.cpp".
- Detemplatize many of the OperatorHelper.h constructors by wrapping them with an adapter; because the operator helper classes share no common base class, the templated constructors were duplicating multiple instantiations. Ideally there would be a common COM base interface implemented by both the IMLOperatorKernelCreationContext and IMLOperatorShapeInferenceContext implementation objects, which a wrapper in MLOperatorAuthorHelper.h could QI for (see the sketch after this list).
- Fix style formatting issues in OperatorHelper.h (sorry for the noise).
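To make the adapter idea concrete, here is a minimal sketch with made-up names (the real wrapper would live in MLOperatorAuthorHelper.h and work against the COM interfaces rather than plain structs):

```cpp
// Hypothetical illustration only; names do not match the actual DML EP headers.
#include <cstdint>
#include <functional>

// Two context types with similar methods but no shared base class (stand-ins).
struct KernelCreationContext  { int64_t GetIntAttribute(const char*) const { return 1; } };
struct ShapeInferenceContext  { int64_t GetIntAttribute(const char*) const { return 1; } };

// Non-templated adapter that helper constructors can accept, so each helper
// needs only one constructor definition instead of one instantiation per context type.
class ContextAdapter
{
public:
    explicit ContextAdapter(const KernelCreationContext& c)
        : m_getInt([&c](const char* name) { return c.GetIntAttribute(name); }) {}
    explicit ContextAdapter(const ShapeInferenceContext& c)
        : m_getInt([&c](const char* name) { return c.GetIntAttribute(name); }) {}

    int64_t GetIntAttribute(const char* name) const { return m_getInt(name); }

private:
    std::function<int64_t(const char*)> m_getInt;
};

// A helper constructor now has a single non-template signature.
struct AxisHelper
{
    explicit AxisHelper(const ContextAdapter& info) : axis(info.GetIntAttribute("axis")) {}
    int64_t axis;
};
```

A shared COM base interface obtained via QueryInterface would achieve the same effect without the std::function indirection.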

```
Summary: Total=4679, Passed=4355, Failed=0, Blocked=0, Not Run=0, Skipped=324
```

Corresponding WindowsAI PR:
https://microsoft.visualstudio.com/WindowsAI/_git/WindowsAI/pullrequest/6973645

Related work items: #36672908, #36672926
fdwr committed Feb 18, 2022
1 parent 4388eae commit 6db6ee5
Showing 16 changed files with 952 additions and 578 deletions.
@@ -124,8 +124,12 @@ namespace Dml::GraphDescBuilder
// Check whether this specific node requested support for constant CPU inputs
if (std::find(requiredConstantCpuInputs.begin(), requiredConstantCpuInputs.end(), inputIndex) != requiredConstantCpuInputs.end())
{
const onnxruntime::NodeArg* arg = node.InputDefs()[inputIndex];
tensor = constantCpuGraphInputGetter(arg->Name());
auto inputDefs = node.InputDefs();
if (inputIndex < inputDefs.size())
{
const onnxruntime::NodeArg* arg = inputDefs[inputIndex];
tensor = constantCpuGraphInputGetter(arg->Name());
}
}

return tensor;
@@ -9,7 +9,7 @@
#include "GraphPartitioner.h"
#include "core/providers/dml/OperatorAuthorHelper/Attributes.h"
#include "core/providers/dml/OperatorAuthorHelper/OperatorHelper.h"
#include "core/providers/dml/OperatorAuthorHelper/OperatorRegistration.h"
#include "core/providers/dml/OperatorAuthorHelper/OperatorVersions.h"
#include "core/framework/kernel_registry.h"
#include "core/graph/graph_utils.h"

@@ -505,41 +505,53 @@ class DmlOperatorElementwiseQLinear : public DmlOperator
std::vector<uint32_t> outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0);
const uint32_t outputShapeDimCount = gsl::narrow_cast<uint32_t>(outputShape.size());

Initialize(kernelInfo, std::nullopt, std::nullopt, outputShape);
Initialize(kernelInfo, std::nullopt, std::nullopt);

// If the axis attribute is explicitly provided, then broadcasting must be performed along that axis.
// So massage the actual shapes of the scale and zero-point tensors (1D with length equal to the input
// axis being broadcast to) into broadcastable shapes.
uint32_t axis = 0;
uint32_t broadcastAxisLength = 0;

// If an axis was explicitly passed (or the default value of 1 comes from the schema),
// then the other inputs are broadcast to the shape of the input data tensor.
if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
{
const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
const uint32_t axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount);
const uint32_t broadcastAxisLength = outputShape[axis];
axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount);
broadcastAxisLength = outputShape[axis];
}


// Explicitly reshape each of the inputs after the first input (scale and zero point tensors).
for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
{
auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);

// Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
// becomes scale[2,1], so that broadcasting works correctly.
std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);

// Explicitly reshape each of the inputs after the first input (scale and zero point tensors).
for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
// If the input tensor is a 1D vector, then extra massaging is needed to project its
// 1D shape back to the full rank for broadcasting along the given axis.
// The 1D vector should have a length equal to the output tensor's dimension on that axis.
if (inputTensorShape.size() == 1 && inputTensorShape != outputShape)
{
auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);

// Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
// becomes scale[2,1], so that broadcasting works correctly.
std::vector<uint32_t> adjustedInputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);
ML_CHECK_VALID_ARGUMENT(adjustedInputTensorShape.size() == 1);
ML_CHECK_VALID_ARGUMENT(adjustedInputTensorShape[0] == broadcastAxisLength);
adjustedInputTensorShape.insert(adjustedInputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);

m_inputTensorDescs[index] = TensorDesc(
edgeDesc.tensorDataType,
gsl::make_span(outputShape),
gsl::make_span(adjustedInputTensorShape),
TensorAxis::DoNotCoerce,
TensorAxis::W,
TensorAxis::RightAligned,
NchwDimensionCount, // minDimensionCount
0 // guaranteedBaseOffsetAlignment
);
ML_CHECK_VALID_ARGUMENT(inputTensorShape[0] == broadcastAxisLength);
inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
}
// For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
// will apply broadcasting with standard elementwise alignment.

m_inputTensorDescs[index] = TensorDesc(
edgeDesc.tensorDataType,
gsl::make_span(outputShape),
gsl::make_span(inputTensorShape),
TensorAxis::DoNotCoerce,
TensorAxis::W,
TensorAxis::RightAligned,
NchwDimensionCount, // minDimensionCount
0 // guaranteedBaseOffsetAlignment
);
}

std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
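To illustrate the shape massaging described in the comments above, here is a small standalone sketch (hypothetical helper name, not part of the DML EP) that computes the broadcast-compatible shape for a 1D scale or zero-point tensor, given the data tensor's rank and the quantization axis:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper: expand a 1D per-axis scale/zero-point shape so it
// broadcasts against a data tensor of rank `outputRank` along `axis`.
// e.g. data [2,3],     axis = 0, scale [2] -> scale [2,1]
//      data [2,3,4,5], axis = 1, scale [3] -> scale [1,3,1,1]
std::vector<uint32_t> ExpandPerAxisShape(uint32_t scaleLength, uint32_t outputRank, uint32_t axis)
{
    assert(axis < outputRank);
    std::vector<uint32_t> shape;
    shape.insert(shape.end(), axis, 1u);                  // leading 1's before the axis
    shape.push_back(scaleLength);                         // the broadcast axis keeps its length
    shape.insert(shape.end(), outputRank - 1 - axis, 1u); // trailing 1's after the axis
    return shape;
}
```

Scalar or already-full-rank scale tensors are left alone, and the right-aligned broadcasting in the TensorDesc constructor handles them directly.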
@@ -14,44 +14,51 @@ class DmlOperatorReduce : public DmlOperator, public ReduceHelperBase
DML_REDUCE_FUNCTION function
)
: DmlOperator(kernelInfo),
ReduceHelperBase(kernelInfo,
kernelInfo.GetTensorShapeDescription(),
(function != DML_REDUCE_FUNCTION_ARGMAX && function != DML_REDUCE_FUNCTION_ARGMIN))
ReduceHelperBase(
kernelInfo,
kernelInfo.GetTensorShapeDescription(),
(function != DML_REDUCE_FUNCTION_ARGMAX && function != DML_REDUCE_FUNCTION_ARGMIN)
)
{
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() == 1);
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() >= 1);
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetOutputCount() == 1);
DmlOperator::Initialize(kernelInfo);
std::vector<std::optional<uint32_t>> inputIndices = { 0 };
std::vector<std::optional<uint32_t>> outputIndices = { 0 };
DmlOperator::Initialize(kernelInfo, inputIndices, outputIndices, std::nullopt, std::nullopt, 1u);

std::vector<uint32_t> dmlAxes;
std::vector<DimensionType> reducedDims = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(0);
int dimOffset = gsl::narrow_cast<int>(m_inputTensorDescs[0].GetDimensionCount() - reducedDims.size());
for (auto& dim : m_axes)
{
// Replace all reduced axes with 1 for their size.
assert(dim < static_cast<int32_t>(reducedDims.size())); // ReduceHelperBase already validated this.
reducedDims[dim] = 1;
dmlAxes.push_back(static_cast<uint32_t>(dim + dimOffset));
dmlAxes.push_back(static_cast<uint32_t>(dim)); // Signed to unsigned which DML expects.
}

if (!m_keepDims)
{
// DML doesn't know about keepDim and always assume the dim is preserved after reduce.
// DML expects the input and output tensors to have identical dimension counts and doesn't know about
// ONNX's 'keepdims' attribute, keeping all dimensions anyway rather than removing those of size 1.
// So if m_keepDims is false, the ONNX output dims differ from the DML tensor desc dims.
//
// ReduceSum example:
// input dims: {3, 2, 2}
// axes: 1
// keepDims: 0
// input dims: {3, 2, 2}
// axes: 1
// keepDims: 0
//
// the ONNX output expect to be of dim {3, 2}, while DML expect the output tensor desc
// dim to be {3, 1, 2}.
//
// The expected ONNX output dims are {3, 2},
// while DML expects the output tensor desc to be {3, 1, 2}.

m_outputTensorDescs[0] = CreateTensorDescFromOutput(
kernelInfo,
0,
TensorAxis::DoNotCoerce,
TensorAxis::W,
kernelInfo,
0,
TensorAxis::DoNotCoerce,
TensorAxis::W,
TensorAxis::RightAligned,
reducedDims);
reducedDims,
1 // minimumDimensionCount
);
}

std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
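As a rough illustration of the keepdims mismatch described above, here is a standalone sketch (hypothetical names, not DML EP code) that derives both shapes from the input shape and the reduced axes:

```cpp
#include <cstdint>
#include <set>
#include <vector>

// Given the input shape and the reduced axes, compute
// (a) the ONNX output shape for keepdims = 0 (reduced axes removed entirely) and
// (b) the DML tensor-desc shape (reduced axes kept with size 1).
void ComputeReduceShapes(
    const std::vector<uint32_t>& inputShape,   // e.g. {3, 2, 2}
    const std::set<uint32_t>& reducedAxes,     // e.g. {1}
    std::vector<uint32_t>& onnxShapeNoKeep,    // -> {3, 2}
    std::vector<uint32_t>& dmlDescShape)       // -> {3, 1, 2}
{
    onnxShapeNoKeep.clear();
    dmlDescShape = inputShape;
    for (uint32_t i = 0; i < inputShape.size(); ++i)
    {
        if (reducedAxes.count(i))
        {
            dmlDescShape[i] = 1;                      // DML keeps the axis with size 1
        }
        else
        {
            onnxShapeNoKeep.push_back(inputShape[i]); // ONNX drops the axis when keepdims = 0
        }
    }
}
```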
@@ -19,9 +19,10 @@ constexpr NameAndIndex coordinateTransformationModes[] =
constexpr NameAndIndex nearestNeighborRoundingModes[] =
{
{"", 0},
{"round_prefer_floor", 0},
{"round_prefer_ceil", 1},
{"floor", 2},
{"round_prefer_floor", 0}, // round halves down
{"round_prefer_ceil", 1}, // round halves up
{"floor", 2}, // round always down
// {"ceil", 3}, // round always up (requires a DirectML API addition)
};
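For reference, a sketch of what each nearest-neighbor rounding mode computes for a fractional source coordinate, following the ONNX Resize `nearest_mode` definitions (illustration only, not DML EP code):

```cpp
#include <cmath>

inline float RoundPreferFloor(float x) { return std::ceil(x - 0.5f); }  // halves round down
inline float RoundPreferCeil(float x)  { return std::floor(x + 0.5f); } // halves round up
inline float RoundAlwaysDown(float x)  { return std::floor(x); }        // "floor"
inline float RoundAlwaysUp(float x)    { return std::ceil(x); }         // "ceil" (no DML mapping yet)
```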

void ComputePixelOffsetsAndScales(
@@ -338,9 +339,9 @@ void CALLBACK QueryResize(IMLOperatorSupportQueryContextPrivate* context, bool*

DML_OP_DEFINE_CREATION_FUNCTION(Resize10, VersionedKernel<DmlOperatorResize, 10>);
DML_OP_DEFINE_CREATION_FUNCTION(Resize11, VersionedKernel<DmlOperatorResize, 11>);
DML_OP_DEFINE_CREATION_FUNCTION(Resize13, VersionedKernel<DmlOperatorResize, 13>);
DML_OP_DEFINE_CREATION_FUNCTION(Upsample7, VersionedKernel<DmlOperatorResize, 7>);
DML_OP_DEFINE_CREATION_FUNCTION(Upsample9, VersionedKernel<DmlOperatorResize, 9>);
DML_OP_DEFINE_CREATION_FUNCTION(Upsample10, VersionedKernel<DmlOperatorResize, 10>);
DML_OP_DEFINE_CREATION_FUNCTION(Upsample13, VersionedKernel<DmlOperatorResize, 13>);

} // namespace Dml
@@ -11,13 +11,18 @@ class DmlOperatorSplit : public DmlOperator, public SplitHelper
public:
using Self = DmlOperatorSplit;

DmlOperatorSplit(const MLOperatorKernelCreationContext& kernelInfo)
DmlOperatorSplit(const MLOperatorKernelCreationContext& kernelInfo, uint32_t opsetVersion)
: DmlOperator(kernelInfo),
SplitHelper(kernelInfo, kernelInfo.GetTensorShapeDescription())
SplitHelper(kernelInfo, kernelInfo.GetTensorShapeDescription(), opsetVersion)
{
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() == 1, "DML only supports split on a single input tensor.");
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetOutputCount() > 0, "Runtime error no output stream specified.");
DmlOperator::Initialize(kernelInfo);
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetInputCount() > 0, "Split needs an input tensor.");
ML_CHECK_VALID_ARGUMENT(kernelInfo.GetOutputCount() > 0, "Split needs an output tensor.");

// Use only the first input tensor. Later opset versions pass parameters
// such as 'split' as dynamic input tensors rather than attributes,
// and that second input is read on the CPU.
std::vector<std::optional<uint32_t>> inputIndices = {0};
DmlOperator::Initialize(kernelInfo, inputIndices, std::nullopt);

uint32_t dmlAxis = GetDmlAdjustedAxis(m_axis, kernelInfo, m_inputTensorDescs.front().GetDimensionCount());

@@ -36,6 +41,8 @@ class DmlOperatorSplit : public DmlOperator, public SplitHelper
}
};

DML_OP_DEFINE_CREATION_FUNCTION(Split, DmlOperatorSplit);
DML_OP_DEFINE_CREATION_FUNCTION(Split7, VersionedKernel<DmlOperatorSplit, 7>);
DML_OP_DEFINE_CREATION_FUNCTION(Split11, VersionedKernel<DmlOperatorSplit, 11>);
DML_OP_DEFINE_CREATION_FUNCTION(Split13, VersionedKernel<DmlOperatorSplit, 13>);

} // namespace Dml
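A rough sketch of the attribute-versus-input difference the Split change above accounts for, assuming the opset-13 behavior where 'split' moves from an attribute to an optional second (CPU-resident) input tensor (hypothetical context type, not the EP's real API):

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Hypothetical context for illustration only.
struct SplitContext
{
    uint32_t opsetVersion;
    std::optional<std::vector<int64_t>> splitAttribute;    // 'split' attribute (opset < 13)
    std::optional<std::vector<int64_t>> splitInputTensor;  // constant CPU input 1 (opset >= 13)
};

// Returns the split sizes regardless of where the model supplied them.
std::vector<int64_t> GetSplitSizes(const SplitContext& context)
{
    if (context.opsetVersion >= 13)
    {
        // Opset 13 moved 'split' from an attribute to an optional input tensor,
        // which must be read on the CPU before building the DML graph node.
        return context.splitInputTensor.value_or(std::vector<int64_t>{});
    }
    return context.splitAttribute.value_or(std::vector<int64_t>{});
}
```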