-rw-r--r--   .SRCINFO                        |  116
-rw-r--r--   14267.diff                      | 1904
-rw-r--r--   15089.diff                      |  476
-rw-r--r--   PKGBUILD                        |  381
-rw-r--r--   build-fixes.patch               |   55
-rw-r--r--   clang.patch                     |   22
-rw-r--r--   install-orttraining-files.diff  |   21
-rw-r--r--   notes.txt                       |    3
-rw-r--r--   system-dnnl.diff                |   37
-rw-r--r--   system-flatbuffers.patch        |   14
10 files changed, 2826 insertions, 203 deletions
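
The bulk of this commit is 14267.diff, apparently a backport of upstream onnxruntime PR 14267, which ports the DNNL execution provider to oneDNN v3.0: operation descriptors disappear, primitive descriptors take the engine as their first argument, and memory-descriptor accessors gain a get_ prefix (dims() becomes get_dims()). A minimal sketch of that construction pattern, assuming oneDNN v3.x headers; the helper names below are illustrative and not part of the patch:

#include "dnnl.hpp"

// v3 memory descriptors are opaque; rebuild one with format_tag::any
// instead of poking md.data.format_kind as the v2 code did.
dnnl::memory::desc make_any(const dnnl::memory::desc& md) {
  return dnnl::memory::desc(md.get_dims(), md.get_data_type(),
                            dnnl::memory::format_tag::any);
}

// v2 needed an intermediate operation descriptor:
//   auto d = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_md, src1_md, dst_md);
//   auto pd = dnnl::binary::primitive_desc(d, eng);
// v3 drops the op descriptor and takes the engine first:
dnnl::binary::primitive_desc make_add_pd(const dnnl::engine& eng,
                                         const dnnl::memory::desc& src0_md,
                                         const dnnl::memory::desc& src1_md) {
  return dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_add,
                                      src0_md, src1_md, src0_md);
}
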
diff --git a/.SRCINFO b/.SRCINFO
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,70 +1,98 @@
 pkgbase = python-onnxruntime
     pkgdesc = Cross-platform, high performance scoring engine for ML models
-    pkgver = 1.9.1
-    pkgrel = 4
+    pkgver = 1.16.3
+    pkgrel = 1
     url = https://github.com/microsoft/onnxruntime
     arch = x86_64
     license = MIT
     makedepends = git
     makedepends = cmake
-    makedepends = gtest
-    makedepends = gmock
+    makedepends = ninja
+    makedepends = gcc-libs
+    makedepends = glibc
+    makedepends = cxxopts
     makedepends = pybind11
-    makedepends = python-setuptools
+    makedepends = abseil-cpp
     makedepends = nlohmann-json
     makedepends = chrono-date
     makedepends = boost
     makedepends = eigen
-    makedepends = flatbuffers
+    makedepends = onednn
+    makedepends = nsync
+    makedepends = openmpi
+    makedepends = python-coloredlogs
+    makedepends = python-flatbuffers
+    makedepends = python-numpy
+    makedepends = python-sympy
+    makedepends = python-setuptools
+    makedepends = python-installer
+    makedepends = python-wheel
+    makedepends = python-build
+    makedepends = chrpath
     makedepends = cuda
     makedepends = cudnn
     makedepends = nccl
-    makedepends = clang
-    depends = nsync
-    depends = re2
-    depends = python-flatbuffers
-    depends = python-numpy
-    depends = python-onnx
-    depends = python-protobuf
-    depends = openmpi
-    depends = onednn
-    options = !lto
-    source = git+https://github.com/microsoft/onnxruntime#tag=v1.9.1
-    source = git+https://github.com/onnx/onnx.git
-    source = git+https://github.com/dcleblanc/SafeInt.git
-    source = git+https://github.com/martinmoene/optional-lite.git
-    source = git+https://github.com/tensorflow/tensorboard.git
-    source = git+https://github.com/dmlc/dlpack.git
-    source = git+https://github.com/jarro2783/cxxopts.git
-    source = pytorch_cpuinfo::git+https://github.com/pytorch/cpuinfo.git
-    source = build-fixes.patch
-    source = clang.patch
+    options = debug
+    source = git+https://github.com/microsoft/onnxruntime#tag=v1.16.3
+    source = install-orttraining-files.diff
     source = system-dnnl.diff
     sha512sums = SKIP
     sha512sums = SKIP
     sha512sums = SKIP
-    sha512sums = SKIP
-    sha512sums = SKIP
-    sha512sums = SKIP
-    sha512sums = SKIP
-    sha512sums = SKIP
-    sha512sums = 685f0235abed6e1277dd0eb9bda56c464d1987fe7fc90a3550e17ec70cc49fd15f34996a0e159f9622c4ca3e6bf29917fe51b7849342531fa2a6808d782f1e06
-    sha512sums = ad94af8bb25744b244c4f82e9a06189741f82b295a88523ca0e8005568fac710c2299d783989457e9cf96ef8da0593fb4f70c8792d416f44ab29d6493e204f13
-    sha512sums = 6735c7aca2ba2f1f2a5286eb064125bf7f2c68a575d572dd157769d15778ff3e717b3a53d696c767748229f23ee6c3a7c82679df1d86283d7c4dd0ec9103ae08

-pkgname = python-onnxruntime
+pkgname = onnxruntime
+    depends = gcc-libs
+    depends = glibc
+    depends = onednn
+    depends = openmpi
+    depends = libmpi.so
+    depends = abseil-cpp
+    depends = nsync
+    provides = libonnxruntime.so
+    provides = libonnxruntime_providers_shared.so

-pkgname = python-onnxruntime-cuda
-    pkgdesc = Cross-platform, high performance scoring engine for ML models (CUDA execution provider)
+pkgname = python-onnxruntime
+    pkgdesc = Cross-platform, high performance scoring engine for ML models (Python Bindings)
+    depends = onnxruntime
+    depends = gcc-libs
+    depends = glibc
+    depends = abseil-cpp
+    depends = openmpi
+    depends = libmpi.so
     depends = nsync
-    depends = re2
+    depends = python-coloredlogs
     depends = python-flatbuffers
     depends = python-numpy
-    depends = python-onnx
-    depends = python-protobuf
-    depends = openmpi
-    depends = onednn
-    depends = cuda
+    depends = python-sympy
+    depends = python-packaging
+    depends = python-setuptools
+    depends = python-requests
+    optdepends = python-onnx: for the backend API, quantization, orttraining, transformers and various tools
+    optdepends = python-psutil: for transformers
+    optdepends = python-py-cpuinfo: for transformers
+    optdepends = python-py3nvml: for transformers
+    optdepends = python-transformers: for transformers
+    optdepends = python-scipy: for transformers and various tools
+    optdepends = python-pytorch: for transformers, orttraining and various tools
+    optdepends = python-pytorch-cuda
+    optdepends = python-cerberus: for orttraining
+    optdepends = python-h5py: for orttraining
+    optdepends = python-matplotlib
+    optdepends = python-tensorflow-opt-cuda
+    optdepends = python-importlib-metadata
+
+pkgname = onnxruntime-cuda
+    pkgdesc = Cross-platform, high performance scoring engine for ML models (CUDA execution provider)
+    depends = gcc-libs
+    depends = glibc
     depends = cudnn
     depends = nccl
-    depends = python-onnxruntime
+    depends = openmpi
+    depends = libmpi.so
+    depends = nsync
+    depends = abseil-cpp
+    depends = cuda
+    depends = libcublas.so
+    depends = libcudart.so
+    conflicts = python-onnxruntime-cuda
+    replaces = python-onnxruntime-cuda
diff --git a/14267.diff b/14267.diff
new file mode 100644
index 000000000000..3a08c9ab7583
--- /dev/null
+++ b/14267.diff
@@ -0,0 +1,1904 @@
+diff --git a/cmake/external/dnnl.cmake b/cmake/external/dnnl.cmake
+index 175ad41b6f0..397c4d6abeb 100644
+--- a/cmake/external/dnnl.cmake
++++ b/cmake/external/dnnl.cmake
+@@ -2,16 +2,16 @@ include (ExternalProject)
+ 
+ set(DNNL_URL https://github.com/oneapi-src/onednn.git)
+ # If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
+-set(DNNL_TAG v2.7.1)
++set(DNNL_TAG v3.0)
+ 
+ if(WIN32)
+   set(DNNL_SHARED_LIB dnnl.dll)
+   set(DNNL_IMPORT_LIB dnnl.lib)
+ else()
+   if (APPLE)
+-    set(DNNL_SHARED_LIB libdnnl.2.dylib)
++    set(DNNL_SHARED_LIB libdnnl.3.dylib)
+   else()
+-    set(DNNL_SHARED_LIB libdnnl.so.2)
++    set(DNNL_SHARED_LIB libdnnl.so.3)
+   endif()
+ endif()
+ 
+diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+index c147a0f4923..c6ee7e9f451 100644
+--- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
++++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+@@ -345,7 +345,7 @@ Status DnnlExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fuse
+   for (size_t i = 0; i < context_num_outputs; i++) {
+     auto output_name = subgraph_primitive->GetOrderedOutputs()[i];
+     auto output_md = subgraph_primitive->GetOutputInfo(output_name);
+-    auto output_shape = output_md.dims();
++    auto output_shape = output_md.get_dims();
+     //if an output is a scaler, onednn internally uses tensor representation (eg, (1,1,...))
+     //but allocating an output with no shape instead of the equivalent tensorshape to avoid shape mismatch
+     if (subgraph_primitive->IsScalarOutput(output_name)) {
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_batchnorm.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_batchnorm.cc
+index a91c5dfc8d5..0e8e9a2f7ad 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_batchnorm.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_batchnorm.cc
+@@ -26,7 +26,7 @@ void DnnlBatchNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto batchnorm_scale_mem = sp.GetMemory(node.Input(IN_SCALE));
+   auto scale_md = batchnorm_scale_mem.get_desc();
+-  auto scale_dims = scale_md.dims();
++  auto scale_dims = scale_md.get_dims();
+ 
+   auto batchnorm_bias_mem = sp.GetMemory(node.Input(IN_B));
+   auto bias_md = batchnorm_bias_mem.get_desc();
+@@ -37,41 +37,30 @@ void DnnlBatchNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto batchnorm_var_mem = sp.GetMemory(node.Input(IN_VAR));
+   auto var_md = batchnorm_var_mem.get_desc();
+ 
++  // Primitive desc info
++  auto dst_md = dnnl::memory::desc(src_md.get_dims(), src_md.get_data_type(), dnnl::memory::format_tag::any);
++  auto flags = dnnl::normalization_flags::use_scale
++               | dnnl::normalization_flags::use_shift
++               | dnnl::normalization_flags::use_global_stats;
+ 
+-  std::vector<memory::desc> src_mds;
+-  src_mds.push_back(scale_md);
+-  src_mds.push_back(bias_md);
+-  const int axis = 0;
+-
+-  //To make the inputs compatible with OneDNN, we need to concatenate scale and bias into a single tensor of length 2XC
+-  //Then, we create the batchnorm pd and feed in the inputs.
+-  auto concat_pd = dnnl::concat::primitive_desc(axis, src_mds, dnnl_engine);
+-
+-  //If using GPU this will move the memory from the CPU to the GPU.
+-  batchnorm_scale_mem = sp.GetMemoryAndReshape(node.Input(IN_SCALE), concat_pd.src_desc(), dnnl_engine);
+-  batchnorm_bias_mem = sp.GetMemoryAndReshape(node.Input(IN_B), concat_pd.src_desc(), dnnl_engine);
+-  batchnorm_mean_mem = sp.GetMemoryAndReshape(node.Input(IN_MEAN), mean_md, dnnl_engine);
+-  batchnorm_var_mem = sp.GetMemoryAndReshape(node.Input(IN_VAR), var_md, dnnl_engine);
+-  auto batchnorm_scale_shift_mem = dnnl::memory(concat_pd.dst_desc(), dnnl_engine);
+-
+-  auto batchnorm_desc = dnnl::batch_normalization_forward::desc(dnnl::prop_kind::forward_inference, src_md, epsilon,
+-                                                                dnnl::normalization_flags::use_scale_shift | dnnl::normalization_flags::use_global_stats);
+-  auto batchnorm_pd = dnnl::batch_normalization_forward::primitive_desc(batchnorm_desc, dnnl_engine);
++  auto batchnorm_pd =
++      dnnl::batch_normalization_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference,
++                                                        src_md, dst_md, epsilon, flags);
+ 
+   // If using GPU this will move the memory from the CPU to the GPU.
+   batchnorm_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), batchnorm_pd.src_desc(), dnnl_engine);
++  batchnorm_scale_mem = sp.GetMemoryAndReshape(node.Input(IN_SCALE), scale_md, dnnl_engine);
++  batchnorm_bias_mem = sp.GetMemoryAndReshape(node.Input(IN_B), bias_md, dnnl_engine);
++  batchnorm_mean_mem = sp.GetMemoryAndReshape(node.Input(IN_MEAN), mean_md, dnnl_engine);
++  batchnorm_var_mem = sp.GetMemoryAndReshape(node.Input(IN_VAR), var_md, dnnl_engine);
+   auto batchnorm_dst_mem = dnnl::memory(batchnorm_pd.dst_desc(), dnnl_engine);
+ 
+-  auto concat_op = dnnl::concat(concat_pd);
+-  sp.AddPrimitive(concat_op, {{DNNL_ARG_MULTIPLE_SRC, batchnorm_scale_mem},
+-                              {DNNL_ARG_MULTIPLE_SRC+1, batchnorm_bias_mem},
+-                              {DNNL_ARG_DST, batchnorm_scale_shift_mem}});
+-
+   auto batchnorm_op = dnnl::batch_normalization_forward(batchnorm_pd);
+   sp.AddPrimitive(batchnorm_op, {{DNNL_ARG_SRC, batchnorm_src_mem},
+                                  {DNNL_ARG_MEAN, batchnorm_mean_mem},
+                                  {DNNL_ARG_VARIANCE, batchnorm_var_mem},
+-                                 {DNNL_ARG_SCALE_SHIFT, batchnorm_scale_shift_mem},
++                                 {DNNL_ARG_SCALE, batchnorm_scale_mem},
++                                 {DNNL_ARG_SHIFT, batchnorm_bias_mem},
+                                  {DNNL_ARG_DST, batchnorm_dst_mem}});
+ 
+   sp.SetMemory(node.Output(OUT_Y), batchnorm_dst_mem);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc
+index 6445aeaec8c..0d845ce2ebf 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc
+@@ -19,8 +19,8 @@ void DnnlBinary::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto src_0_ori_md = binary_src0_mem.get_desc();
+   auto src_1_ori_md = binary_src1_mem.get_desc();
+ 
+-  auto src_0_dims = src_0_ori_md.dims();
+-  auto src_1_dims = src_1_ori_md.dims();
++  auto src_0_dims = src_0_ori_md.get_dims();
++  auto src_1_dims = src_1_ori_md.get_dims();
+   if (src_0_dims.size() != src_1_dims.size()) {
+     while (src_0_dims.size() < src_1_dims.size()) {
+       src_0_dims.insert(src_0_dims.begin(), 1);
+@@ -42,8 +42,7 @@ void DnnlBinary::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto dst_md = dnnl::memory::desc(output_shape, node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
+ 
+-  auto binary_d = dnnl::binary::desc(algo, src_0_md, src_1_md, dst_md);
+-  auto binary_pd = dnnl::binary::primitive_desc(binary_d, eng);
++  auto binary_pd = dnnl::binary::primitive_desc(eng, algo, src_0_md, src_1_md, dst_md);
+ 
+   auto binary_dst_mem = dnnl::memory(binary_pd.dst_desc(), eng);
+   auto binary_prim = dnnl::binary(binary_pd);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_cast.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_cast.cc
+index 1a21d290e2b..9100b16377f 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_cast.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_cast.cc
+@@ -19,7 +19,7 @@ void DnnlCast::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto src_mem = sp.GetMemory(node.Input(IN_INPUT));
+   auto src_tag = node.Input(IN_INPUT).Format();
+   auto src_md = src_mem.get_desc();
+-  auto src_dims = src_md.dims();
++  auto src_dims = src_md.get_dims();
+ 
+   // dst characteristics
+   dnnl::memory::data_type dst_type;
+@@ -71,7 +71,7 @@ void DnnlCast::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   }
+ 
+   // Generate the dst memory descriptor
+-  auto dst_md = dnnl::memory::desc(src_md.dims(), dst_type, dst_tag);
++  auto dst_md = dnnl::memory::desc(src_md.get_dims(), dst_type, dst_tag);
+ 
+   // Create the reorder primitive descriptor.
+   auto reorder_pd = dnnl::reorder::primitive_desc(dnnl_engine, src_md, dnnl_engine, dst_md);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_concat.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_concat.cc
+index fcc72621b41..5ca4f24eef1 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_concat.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_concat.cc
+@@ -31,7 +31,7 @@ void DnnlConcat::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto axis = GetAxis(node, input_rank != -1 ? input_rank : 0);
+ 
+   // Create primitive descriptor
+-  auto concat_pd = dnnl::concat::primitive_desc(static_cast<int>(axis), src_mds, dnnl_engine);
++  auto concat_pd = dnnl::concat::primitive_desc(dnnl_engine, static_cast<int>(axis), src_mds);
+ 
+   // Create primitive memory objects
+   std::vector<dnnl::memory> concat_src_mems;
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.cc
+index a076633ca8d..a9d2d3eb6f3 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.cc
+@@ -21,13 +21,13 @@ void DnnlConv::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto conv_src_mem = sp.GetMemory(node.Input(IN_X));
+   auto src_md = conv_src_mem.get_desc();
+-  src_md.data.format_kind = dnnl_format_kind_t::dnnl_format_kind_any;
+-  auto src_dims = conv_src_mem.get_desc().dims();
++  src_md = dnnl::memory::desc(src_md.get_dims(), src_md.get_data_type(), dnnl::memory::format_tag::any);
++  auto src_dims = conv_src_mem.get_desc().get_dims();
+ 
+   auto conv_weights_mem = sp.GetMemory(node.Input(IN_W));
+   auto weight_md = conv_weights_mem.get_desc();
+-  weight_md.data.format_kind = dnnl_format_kind_t::dnnl_format_kind_any;
+-  auto weight_dims_original = conv_weights_mem.get_desc().dims();
++  weight_md = dnnl::memory::desc(weight_md.get_dims(), weight_md.get_data_type(), dnnl::memory::format_tag::any);
++  auto weight_dims_original = conv_weights_mem.get_desc().get_dims();
+   dnnl::memory::dims weight_dims = weight_dims_original;
+ 
+   bool bias_exists = node.Input(IN_B).Exists();
+@@ -97,27 +97,20 @@ void DnnlConv::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   dnnl::primitive_attr attr;
+   if (has_relu) {
+-    const float ops_scale = 1.f;
+-    const float ops_alpha = 0.f;
+-    const float ops_beta = 0.f;
+     dnnl::post_ops ops;
+-    ops.append_eltwise(ops_scale, dnnl::algorithm::eltwise_relu, ops_alpha, ops_beta);
++    ops.append_eltwise(dnnl::algorithm::eltwise_relu, 0.f, 0.f);
+     attr.set_post_ops(ops);
+   }
+ 
+   dnnl::convolution_forward::primitive_desc conv_pd;
+   if (bias_exists) {
+-    auto conv_desc = dnnl::convolution_forward::desc(
+-        prop_kind, dnnl::algorithm::convolution_direct,
+-        src_md, weight_md, bias_md, dst_md,
+-        strides, dilations, padding_left, padding_right);
+-    conv_pd = dnnl::convolution_forward::primitive_desc(conv_desc, attr, dnnl_engine);
++    conv_pd = dnnl::convolution_forward::primitive_desc(dnnl_engine, prop_kind, dnnl::algorithm::convolution_direct,
++                                                        src_md, weight_md, bias_md, dst_md, strides, dilations,
++                                                        padding_left, padding_right, attr);
+   } else {
+-    auto conv_desc = dnnl::convolution_forward::desc(
+-        prop_kind, dnnl::algorithm::convolution_direct,
+-        src_md, weight_md, dst_md,
+-        strides, dilations, padding_left, padding_right);
+-    conv_pd = dnnl::convolution_forward::primitive_desc(conv_desc, attr, dnnl_engine);
++    conv_pd = dnnl::convolution_forward::primitive_desc(dnnl_engine, prop_kind, dnnl::algorithm::convolution_direct,
++                                                        src_md, weight_md, dst_md, strides, dilations, padding_left,
++                                                        padding_right, attr);
+   }
+ 
+   // If using GPU this will move the memory from the CPU to the GPU.
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc
+index 1208f206d7f..d8a245b5f7f 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc
+@@ -49,15 +49,15 @@ void DnnlConvGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto dy_mem = sp.GetMemory(node.Input(IN_DY));
+   auto dy_md = dy_mem.get_desc();
+-  auto dy_dims = dy_mem.get_desc().dims();
++  auto dy_dims = dy_mem.get_desc().get_dims();
+ 
+   auto x_mem = sp.GetMemory(node.Input(IN_X));
+   auto x_md = x_mem.get_desc();
+-  auto x_dims = x_mem.get_desc().dims();
++  auto x_dims = x_mem.get_desc().get_dims();
+ 
+   auto w_mem = sp.GetMemory(node.Input(IN_W));
+   auto w_md = w_mem.get_desc();
+-  auto w_dims_original = w_mem.get_desc().dims();
++  auto w_dims_original = w_mem.get_desc().get_dims();
+   auto w_dims = w_dims_original;
+ 
+   bool dx_required = node.Output(OUT_DX).Exists();
+@@ -122,37 +122,39 @@ void DnnlConvGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   // Reproduce the forward convolution pd.
+   dnnl::convolution_forward::primitive_desc conv_forward_pd;
+   if (db_required) {
+-    auto conv_forward_desc = dnnl::convolution_forward::desc(dnnl::prop_kind::forward_training,
+-                                                             dnnl::algorithm::convolution_direct,
+-                                                             fwd_x_md, w_md, fwd_b_md, fwd_y_md,
+-                                                             strides, dilations, padding_left, padding_right);
+-    conv_forward_pd = dnnl::convolution_forward::primitive_desc(conv_forward_desc, dnnl_engine);
++    conv_forward_pd = dnnl::convolution_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_training,
++                                                                dnnl::algorithm::convolution_direct,
++                                                                fwd_x_md, w_md, fwd_b_md, fwd_y_md,
++                                                                strides, dilations, padding_left, padding_right);
+   } else {
+-    auto conv_forward_desc = dnnl::convolution_forward::desc(dnnl::prop_kind::forward_training,
+-                                                             dnnl::algorithm::convolution_direct,
+-                                                             fwd_x_md, w_md, fwd_y_md,
+-                                                             strides, dilations, padding_left, padding_right);
+-    conv_forward_pd = dnnl::convolution_forward::primitive_desc(conv_forward_desc, dnnl_engine);
++
++    conv_forward_pd = dnnl::convolution_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_training,
++                                                                dnnl::algorithm::convolution_direct,
++                                                                fwd_x_md, w_md, fwd_y_md,
++                                                                strides, dilations, padding_left, padding_right);
+   }
+ 
+   // Create the convolution backward data primitive desc
+-  auto conv_backward_data_desc = dnnl::convolution_backward_data::desc(dnnl::algorithm::convolution_direct,
+-                                                                       dx_md, w_md, dy_md,
+-                                                                       strides, dilations, padding_left, padding_right);
+-  auto conv_backward_data_pd = dnnl::convolution_backward_data::primitive_desc(conv_backward_data_desc, dnnl_engine, conv_forward_pd);
++  auto conv_backward_data_pd =
++      dnnl::convolution_backward_data::primitive_desc(dnnl_engine, dnnl::algorithm::convolution_direct,
++                                                      dx_md, w_md, dy_md, strides, dilations, padding_left,
++                                                      padding_right, conv_forward_pd);
+ 
+   // Create the convolution backward weights primitve desc
+   dnnl::convolution_backward_weights::primitive_desc conv_backward_weights_pd;
+   if (db_required) {
+-    auto conv_backward_weights_desc = dnnl::convolution_backward_weights::desc(dnnl::algorithm::convolution_direct,
+-                                                                               x_md, dw_md, db_md, dy_md,
+-                                                                               strides, dilations, padding_left, padding_right);
+-    conv_backward_weights_pd = dnnl::convolution_backward_weights::primitive_desc(conv_backward_weights_desc, dnnl_engine, conv_forward_pd);
++
++    conv_backward_weights_pd =
++        dnnl::convolution_backward_weights::primitive_desc( dnnl_engine, dnnl::algorithm::convolution_direct,
++                                                           x_md, dw_md, db_md, dy_md,
++                                                           strides, dilations, padding_left,
++                                                           padding_right, conv_forward_pd);
+   } else {
+-    auto conv_backward_weights_desc = dnnl::convolution_backward_weights::desc(dnnl::algorithm::convolution_direct,
+-                                                                               x_md, dw_md, dy_md,
+-                                                                               strides, dilations, padding_left, padding_right);
+-    conv_backward_weights_pd = dnnl::convolution_backward_weights::primitive_desc(conv_backward_weights_desc, dnnl_engine, conv_forward_pd);
++    conv_backward_weights_pd =
++        dnnl::convolution_backward_weights::primitive_desc( dnnl_engine, dnnl::algorithm::convolution_direct,
++                                                           x_md, dw_md, dy_md,
++                                                           strides, dilations, padding_left, padding_right,
++                                                           conv_forward_pd);
+   }
+ 
+   // check if memory needs to be moved to GPU
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc
+index cde20fdaca2..074df058806 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc
+@@ -47,7 +47,7 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
+   // Get descs
+   auto x_md = x_mem.get_desc();
+   auto x_scale_md = x_scale_mem.get_desc();
+-  auto x_dims = x_md.dims().size();
++  auto x_dims = x_md.get_dims().size();
+ 
+   // Fix scale dims
+   int64_t axis = GetAxis(node, x_dims);
+@@ -65,11 +65,11 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
+   }
+ 
+   // Create dst mem
+-  auto dst_md = dnnl::memory::desc(x_md.dims(), node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
++  auto dst_md = dnnl::memory::desc(x_md.get_dims(), node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
+   dnnl::memory dst_mem;
+ 
+   // If zero point exists and we are NOT dequantizing int32, then substract zp from x and scale
+-  if (isZeroPointUseful && (x_mem.get_desc().data_type() != dnnl::memory::data_type::s32)) {
++  if (isZeroPointUseful && (x_mem.get_desc().get_data_type() != dnnl::memory::data_type::s32)) {
+     // Get Zero point
+     auto x_zp_mem = sp.GetMemory(node.Input(IN_X_ZERO_POINT));
+     // Get mds for operands
+@@ -84,8 +84,6 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
+       Padd(&x_zp_md, static_cast<uint64_t>(axis) + 1, x_dims);
+     }
+ 
+-    // Create binary desc
+-    auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_sub, x_md, x_zp_md, dst_md);
+     // Add post op scale
+     dnnl::primitive_attr binary_attr;
+     {
+@@ -94,7 +92,8 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
+       binary_attr.set_post_ops(binary_ops);
+     }
+     // Add post op to scale result
+-    auto binary_pd = dnnl::binary::primitive_desc(binary_d, binary_attr, dnnl_engine);
++    auto binary_pd = dnnl::binary::primitive_desc(dnnl_engine, dnnl::algorithm::binary_sub,
++                                                  x_md, x_zp_md, dst_md, binary_attr);
+     // Move to GPU if available
+     x_zp_mem = sp.GetMemoryAndReshape(node.Input(IN_X_ZERO_POINT), x_zp_md, dnnl_engine);
+     // Create primitive and set dst mem
+@@ -108,9 +107,9 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode&
+ 
+     // If zp doesn't exists or we are dequantizing from int32, only need to scale
+   } else {
+-    // Create binary and primitive desc
+-    auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_mul, x_md, x_scale_md, dst_md);
+-    auto binary_pd = dnnl::binary::primitive_desc(binary_d, dnnl_engine);
++    // Create binary primitive desc
++    auto binary_pd = dnnl::binary::primitive_desc(dnnl_engine, dnnl::algorithm::binary_mul,
++                                                  x_md, x_scale_md, dst_md);
+ 
+     // Create primitive
+     dst_mem = dnnl::memory(binary_pd.dst_desc(), dnnl_engine);
+@@ -133,8 +132,8 @@ bool DnnlDequantizeLinear::isZeroPointNonZero(dnnl::memory* zp_mem) {
+   // Because zp will always be int8, uint8 or int32, this cast is always valid
+   auto zp_data = static_cast<uint8_t*>(zp_mem->get_data_handle());
+   // Adjust the iteration num
+-  auto topline = zp_mem->get_desc().dims().size();
+-  if (zp_mem->get_desc().data_type() == dnnl::memory::data_type::s32) {
++  auto topline = zp_mem->get_desc().get_dims().size();
++  if (zp_mem->get_desc().get_data_type() == dnnl::memory::data_type::s32) {
+     topline *= 4;
+   }
+   // ZP is either a scalar or a 1-D vector so iterate over all the dimensions
+@@ -150,7 +149,7 @@ bool DnnlDequantizeLinear::isZeroPointNonZero(dnnl::memory* zp_mem) {
+ 
+ void DnnlDequantizeLinear::Padd(dnnl::memory::desc* target_md, size_t front_pad, size_t back_pad) {
+   // Pads an input to broadcast the op correctly
+-  auto target_dims = target_md->dims();
++  auto target_dims = target_md->get_dims();
+ 
+   // Add front padding
+   while (target_dims.size() < front_pad) {
+@@ -185,8 +184,8 @@ int64_t DnnlDequantizeLinear::GetAxis(DnnlNode& node, size_t x_dims) {
+ void DnnlDequantizeLinear::ValidateDims(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   // We only need to validate when zp is provided
+   if (node.Input(IN_X_ZERO_POINT).Exists()) {
+-    auto x_scale_dims = sp.GetMemory(node.Input(IN_X_SCALE)).get_desc().dims();
+-    auto x_zp_dims = sp.GetMemory(node.Input(IN_X_ZERO_POINT)).get_desc().dims();
++    auto x_scale_dims = sp.GetMemory(node.Input(IN_X_SCALE)).get_desc().get_dims();
++    auto x_zp_dims = sp.GetMemory(node.Input(IN_X_ZERO_POINT)).get_desc().get_dims();
+ 
+     if (x_zp_dims != x_scale_dims) {
+       ORT_THROW("x_scale and x_zero_point dimensions does not match");
+@@ -200,7 +199,7 @@ void DnnlDequantizeLinear::ValidateType(DnnlSubgraphPrimitive& sp, DnnlNode& nod
+   auto x_md = sp.GetMemory(node.Input(IN_X)).get_desc();
+   auto x_zp_md = sp.GetMemory(node.Input(IN_X_ZERO_POINT)).get_desc();
+ 
+-  if (x_md.data_type() != x_zp_md.data_type()) {
++  if (x_md.get_data_type() != x_zp_md.get_data_type()) {
+     ORT_THROW("x and x_zero_point have different datatypes");
+   }
+ }
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dynamicquantizelinear.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dynamicquantizelinear.cc
+index 1d24e863297..b62cd7cb628 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dynamicquantizelinear.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dynamicquantizelinear.cc
+@@ -4,6 +4,8 @@
+ #include "dnnl_dynamicquantizelinear.h"
+ #include "dnnl_subgraph.h"
+ #include "dnnl_subgraph_primitive.h"
++#include "dnnl_util.h"
++
+ 
+ namespace onnxruntime {
+ namespace ort_dnnl {
+@@ -23,7 +25,7 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+   // Get src mem
+   auto x_mem = sp.GetMemory(node.Input(IN_X));
+   auto x_md = x_mem.get_desc();
+-  auto x_size = x_md.dims().size();
++  auto x_size = x_md.get_dims().size();
+   auto x_format = sp.GetDnnlFormat(x_size);
+   x_mem = sp.GetMemoryAndReshape(node.Input(IN_X), x_md, eng);
+ 
+@@ -31,10 +33,8 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+   dnnl::memory::dims one_dim(x_size, 1);
+ 
+   // Y_SCALE COMPUTATION
+-  // Create descriptor for reduction max and min
+-  auto y_scale_md = dnnl::memory::desc(one_dim, x_md.data_type(), x_format);
+-  auto max_reduction_d = dnnl::reduction::desc(dnnl::algorithm::reduction_max, x_md, y_scale_md, 0.f, 0.f);
+-  auto min_reduction_d = dnnl::reduction::desc(dnnl::algorithm::reduction_min, x_md, y_scale_md, 0.f, 0.f);
++  // Create descriptor for y_scale
++  auto y_scale_md = dnnl::memory::desc(one_dim, x_md.get_data_type(), x_format);
+ 
+   // Fill memory with 0's, needed for min and max binary
+   auto zero_mem = dnnl::memory(y_scale_md, eng);
+@@ -50,7 +50,7 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+     // y_scale = x_max - x_min
+     calc_y_scale.append_binary(dnnl::algorithm::binary_sub, y_scale_md);
+     // y_scale =/ 255
+-    calc_y_scale.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, 1.0f / 255.0f, 0.0f);
++    calc_y_scale.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0f / 255.0f, 0.0f);
+     max_reduction_attr.set_post_ops(calc_y_scale);
+   }
+ 
+@@ -63,8 +63,11 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+   }
+ 
+   // Create reduction primitive
+-  auto max_reduction_prim = dnnl::reduction(dnnl::reduction::primitive_desc(max_reduction_d, max_reduction_attr, eng));
+-  auto min_reduction_prim = dnnl::reduction(dnnl::reduction::primitive_desc(min_reduction_d, min_reduction_attr, eng));
++  auto max_reduction_prim = dnnl::reduction({eng, dnnl::algorithm::reduction_max,
++                                             x_md, y_scale_md, 0.f, 0.f, max_reduction_attr});
++  auto min_reduction_prim = dnnl::reduction( {eng, dnnl::algorithm::reduction_min,
++                                             x_md, y_scale_md, 0.f, 0.f, min_reduction_attr});
++
+ 
+   // Create y_scale and min memory
+   auto y_scale_mem = dnnl::memory(y_scale_md, eng);
+@@ -85,43 +88,48 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+   // Y_ZERO_POINT COMPUTATION
+   // Create memory and primitive descriptors
+   auto y_zp_md = dnnl::memory::desc(one_dim, dnnl::memory::data_type::u8, x_format);
+-  auto zp_prim_d = dnnl::binary::desc(dnnl::algorithm::binary_div, y_scale_md, y_scale_md, y_zp_md);
+ 
+   // Add round and clip post ops
+   dnnl::primitive_attr zp_prim_attr;
+   {
+-    zp_prim_attr.set_scales(DNNL_ARG_SRC_0, 0, {-1.0f});
+     dnnl::post_ops div_saturate_round;
+-    div_saturate_round.append_eltwise(1.0f, dnnl::algorithm::eltwise_round, 0.0f, 0.0f);
++    div_saturate_round.append_eltwise(dnnl::algorithm::eltwise_round, 0.0f, 0.0f);
+     zp_prim_attr.set_post_ops(div_saturate_round);
+   }
++  // Set the value to scale DNNL_ARG_SRC_0 with mask 0
++  zp_prim_attr.set_scales_mask(DNNL_ARG_SRC_0, 0);
++  // Create the memory object related to the scale
++  auto scale_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng);
++  // Write the alpha value into the memory object
++  sp.WriteToDnnlMemory<float>(scale_mem, {-1.0f});
+ 
+   // Create primitives
+-  auto zp_prim_pd = dnnl::binary::primitive_desc(zp_prim_d, zp_prim_attr, eng);
++  auto zp_prim_pd = dnnl::binary::primitive_desc( eng, dnnl::algorithm::binary_div,
++                                                 y_scale_md, y_scale_md, y_zp_md, zp_prim_attr);
+   auto zp_prim = dnnl::binary(zp_prim_pd);
+ 
+   // Create zp memory dst
+   auto y_zp_mem = dnnl::memory(zp_prim_pd.dst_desc(), eng);
+ 
+   // Calc zp
+-  sp.AddPrimitive(zp_prim,{{DNNL_ARG_SRC_0, min_reduction_mem},
+-                           {DNNL_ARG_SRC_1, y_scale_mem},
+-                           {DNNL_ARG_DST, y_zp_mem}});
++  sp.AddPrimitive(zp_prim,{ {DNNL_ARG_SRC_0, min_reduction_mem},
++                            {DNNL_ARG_SRC_1, y_scale_mem},
++                            {DNNL_ARG_DST, y_zp_mem},
++                            {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, scale_mem}});
+ 
+   // Y COMPUTATION
+   // Create y md and binary desc
+-  auto y_md = dnnl::memory::desc(x_md.dims(), dnnl::memory::data_type::u8, x_format);
+-  auto y_bin_d = dnnl::binary::desc(dnnl::algorithm::binary_div, x_mem.get_desc(), y_scale_mem.get_desc(), y_md);
++  auto y_md = dnnl::memory::desc(x_md.get_dims(), dnnl::memory::data_type::u8, x_format);
+   // Add post ops
+   dnnl::primitive_attr y_bin_attr;
+   {
+     dnnl::post_ops round_add;
+-    round_add.append_eltwise(1.0f, dnnl::algorithm::eltwise_round, 0.0f, 0.0f);
++    round_add.append_eltwise(dnnl::algorithm::eltwise_round, 0.0f, 0.0f);
+     round_add.append_binary(dnnl::algorithm::binary_add, y_zp_mem.get_desc());
+     y_bin_attr.set_post_ops(round_add);
+   }
+   // Create binary primitive with post ops
+-  auto y_pd = dnnl::binary::primitive_desc(y_bin_d, y_bin_attr, eng);
++  auto y_pd = dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_div, x_mem.get_desc(), y_scale_mem.get_desc(), y_md, y_bin_attr);
+   auto y_prim = dnnl::binary(y_pd);
+   // Create y_dst mem
+   auto y_mem = dnnl::memory(y_pd.dst_desc(), eng);
+@@ -139,8 +147,8 @@ void DnnlDynamicQuantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlN
+ 
+   //change md to targeted data type of cast op dst
+   dnnl::memory::desc DnnlDynamicQuantizeLinear::ChangeMemoryDescDataType(dnnl::memory::desc md, dnnl::memory::data_type dt) {
+-  auto dims = md.dims();
+-  auto strides = md.data.format_desc.blocking.strides;
++  auto dims = md.get_dims();
++  auto strides = md.get_strides();
+   dnnl::memory::dims strides_vec;
+   for (size_t i = 0; i < dims.size(); i++) {
+     strides_vec.push_back(strides[i]);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_elementwise.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_elementwise.cc
+index 4d825474d8b..a2c8a02f42f 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_elementwise.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_elementwise.cc
+@@ -35,17 +35,25 @@ void DnnlElementwise::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+       }
+       break;
+     }
++    case dnnl::algorithm::eltwise_soft_relu: {
++      if (node.OpType() == "Softplus") {
++        requires_alpha = true;
++        alpha = 1.0f;
++      }
++      break;
++    }
+     default:
+       alpha = 0.0;
+   }
+ 
++  // Generate a dst_md from the src data
++  auto dst_md = dnnl::memory::desc(src_md.get_dims(), src_md.get_data_type(), dnnl::memory::format_tag::any);
++
+   dnnl::eltwise_forward::primitive_desc elementwise_pd;
+   if (requires_alpha) {
+-    auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, src_md, alpha);
+-    elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine);
++    elementwise_pd = dnnl::eltwise_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference, algo, src_md, dst_md, alpha);
+   } else {
+-    auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, src_md);
+-    elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine);
++    elementwise_pd = dnnl::eltwise_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference, algo, src_md, dst_md);
+   }
+ 
+   // If using GPU this will move the memory from the CPU to the GPU.
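
The hunks above (dnnl_dynamicquantizelinear.cc, and dnnl_gemm.cc below) all apply the same second migration: fixed scale values baked into a primitive_attr in v2 become runtime memory arguments in v3. A self-contained sketch of that pattern, assuming oneDNN v3.x on a CPU engine; the function and variable names are illustrative, not code from the patch:

#include "dnnl.hpp"

// Compute dst = (value * src0) / src1 with the scale supplied at execution
// time, mirroring how the patch rewires the y_zero_point computation.
dnnl::memory scaled_div(dnnl::engine& eng, dnnl::stream& strm,
                        dnnl::memory& src0, dnnl::memory& src1, float value) {
  dnnl::primitive_attr attr;
  // v2: attr.set_scales(DNNL_ARG_SRC_0, 0, {value});
  attr.set_scales_mask(DNNL_ARG_SRC_0, 0);

  auto pd = dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_div,
                                         src0.get_desc(), src1.get_desc(),
                                         src0.get_desc(), attr);
  auto dst = dnnl::memory(pd.dst_desc(), eng);

  // The scale now travels as an f32 memory object passed with
  // DNNL_ARG_ATTR_SCALES (the patch writes it via sp.WriteToDnnlMemory<float>).
  auto scale_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng);
  *static_cast<float*>(scale_mem.get_data_handle()) = value;

  dnnl::binary(pd).execute(strm, {{DNNL_ARG_SRC_0, src0},
                                  {DNNL_ARG_SRC_1, src1},
                                  {DNNL_ARG_DST, dst},
                                  {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, scale_mem}});
  return dst;
}
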
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_gelu.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_gelu.cc
+index 6e1b6fcd5a6..d0df371b488 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_gelu.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_gelu.cc
+@@ -29,8 +29,8 @@ void DnnlGelu::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     auto src0_ori_md = src_mem.get_desc();
+     auto src1_ori_md = bias_mem.get_desc();
+ 
+-    auto src0_dims = src0_ori_md.dims();
+-    auto src1_dims = src1_ori_md.dims();
++    auto src0_dims = src0_ori_md.get_dims();
++    auto src1_dims = src1_ori_md.get_dims();
+     if (src0_dims.size() != src1_dims.size()) {
+       while (src0_dims.size() < src1_dims.size()) {
+         src0_dims.insert(src0_dims.begin(), 1);
+@@ -53,13 +53,12 @@ void DnnlGelu::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     dnnl::primitive_attr attr;
+     dnnl::post_ops ops;
+     dnnl::algorithm algo = dnnl_util::OrtOperatorToDnnlAlgorithm(node.OpType());
+-    ops.append_eltwise(1.0f, algo, 1.0f, 1.0f);
++    ops.append_eltwise(algo, 1.0f, 1.0f);
+     attr.set_post_ops(ops);
+ 
+     auto dst_md = dnnl::memory::desc(output_shape, node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
+ 
+-    auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_md, src1_md, dst_md);
+-    auto binary_pd = dnnl::binary::primitive_desc(binary_d, attr, dnnl_engine);
++    auto binary_pd = dnnl::binary::primitive_desc(dnnl_engine, dnnl::algorithm::binary_add, src0_md, src1_md, dst_md, attr);
+ 
+     dst_mem = dnnl::memory(binary_pd.dst_desc(), dnnl_engine);
+     auto binary_prim = dnnl::binary(binary_pd);
+@@ -68,9 +67,12 @@ void DnnlGelu::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+                                 {DNNL_ARG_SRC_1, bias_mem},
+                                 {DNNL_ARG_DST, dst_mem}});
+   } else {
++    auto dst_md = dnnl::memory::desc( src_mem.get_desc().get_dims(),
++                                     node.Output(OUT_Y).Type(),
++                                     dnnl::memory::format_tag::any);
+     dnnl::algorithm algo = dnnl_util::OrtOperatorToDnnlAlgorithm(node.OpType());
+-    auto gelu_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, algo, gelu_src_mem.get_desc());
+-    auto gelu_pd = dnnl::eltwise_forward::primitive_desc(gelu_desc, dnnl_engine);
++    auto gelu_pd = dnnl::eltwise_forward::primitive_desc( dnnl_engine, dnnl::prop_kind::forward_inference, algo,
++                                                         gelu_src_mem.get_desc(), dst_md);
+ 
+     // If using GPU this will move the memory from the CPU to the GPU.
+     gelu_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), gelu_pd.src_desc(), dnnl_engine);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_gemm.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_gemm.cc
+index 6178bbab85b..364ebdf5f22 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_gemm.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_gemm.cc
+@@ -4,6 +4,7 @@
+ #include "dnnl_gemm.h"
+ #include "dnnl_subgraph.h"
+ #include "dnnl_subgraph_primitive.h"
++#include "dnnl_util.h"
+ 
+ namespace onnxruntime {
+ namespace ort_dnnl {
+@@ -56,8 +57,8 @@ OneDNN algorithm:
+ void DnnlGemm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto eng = sp.GetEngine();
+ 
+-  auto a_dims = sp.GetMemory(node.Input(IN_A)).get_desc().dims();
+-  auto b_dims = sp.GetMemory(node.Input(IN_B)).get_desc().dims();
++  auto a_dims = sp.GetMemory(node.Input(IN_A)).get_desc().get_dims();
++  auto b_dims = sp.GetMemory(node.Input(IN_B)).get_desc().get_dims();
+ 
+   bool input_c_exists = node.Input(IN_C).Exists();
+ 
+@@ -92,14 +93,17 @@ void DnnlGemm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   dnnl::primitive_attr matmul_attr;
+   // scale the output from MatMul to alpha
+   float alpha = GetAlpha(node);
+-  std::vector<float> alphaScale({alpha});
+-  matmul_attr.set_output_scales(0, alphaScale);
++  // Set the value to scale DNNL_ARG_SRC with mask 0
++  matmul_attr.set_scales_mask(DNNL_ARG_SRC, 0);
++  // Create the memory object related to the scale
++  auto alpha_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng);
++  // Write the alpha value into the memory object
++  sp.WriteToDnnlMemory<float>(alpha_mem, {alpha});
+ 
+   auto matmul_dst_md = dnnl::memory::desc(output_shape, node.Output(OUT_Y).Type(), {N, 1});
+ 
+-  auto matmul_d = dnnl::matmul::desc(a_md, b_md, matmul_dst_md);
+   dnnl::matmul::primitive_desc matmul_pd;
+-  matmul_pd = dnnl::matmul::primitive_desc(matmul_d, matmul_attr, eng);
++  matmul_pd = dnnl::matmul::primitive_desc(eng, a_md, b_md, matmul_dst_md, matmul_attr);
+ 
+   auto matmul_a_mem = sp.GetMemoryAndReshape(node.Input(IN_A), matmul_pd.src_desc(), eng, transA);
+   auto matmul_b_mem = sp.GetMemoryAndReshape(node.Input(IN_B), matmul_pd.weights_desc(), eng, transB);
+@@ -111,12 +115,14 @@ void DnnlGemm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   args.insert({DNNL_ARG_SRC, matmul_a_mem});
+   args.insert({DNNL_ARG_WEIGHTS, matmul_b_mem});
+   args.insert({DNNL_ARG_DST, gemm_dst_mem});
++  // Set alpha_mem to scale the output
++  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, alpha_mem});
+ 
+   sp.AddPrimitive(matmul_op, args);
+ 
+   if (input_c_exists) {
+     auto c_original_md = sp.GetMemory(node.Input(IN_C)).get_desc();
+-    auto c_dims = c_original_md.dims();
++    auto c_dims = c_original_md.get_dims();
+     if (c_dims.size() != a_dims.size()) {
+       while (c_dims.size() < a_dims.size()) {
+         c_dims.insert(c_dims.begin(), 1);
+@@ -127,14 +133,18 @@ void DnnlGemm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+     auto y_md = dnnl::memory::desc(output_shape, node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
+ 
+-    auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_add, matmul_pd.dst_desc(), c_md, y_md);
+-
+     // Scale input C by beta before adding it to the MatMul output.
+     dnnl::primitive_attr binary_attr;
+     float beta = GetBeta(node);
+-    binary_attr.set_scales(DNNL_ARG_SRC_1, 0, {beta});
++    // Set the value to scale DNNL_ARG_SRC_1 with mask 0
++    binary_attr.set_scales_mask(DNNL_ARG_SRC_1, 0);
++    // Create the memory object related to the scale
++    auto beta_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng);
++    // Write the alpha value into the memory object
++    sp.WriteToDnnlMemory<float>(beta_mem, {beta});
+ 
+-    auto binary_pd = dnnl::binary::primitive_desc(binary_d, binary_attr,eng);
++    auto binary_pd = dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_add,
++                                                  matmul_pd.dst_desc(), c_md, y_md, binary_attr);
+ 
+     auto binary_c_mem = sp.GetMemoryAndReshape(node.Input(IN_C), binary_pd.src1_desc(), eng);
+ 
+@@ -142,7 +152,8 @@ void DnnlGemm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+     sp.AddPrimitive(binary_op, {{DNNL_ARG_SRC_0, gemm_dst_mem},
+                                 {DNNL_ARG_SRC_1, binary_c_mem},
+-                                {DNNL_ARG_DST, gemm_dst_mem}});
++                                {DNNL_ARG_DST, gemm_dst_mem},
++                                {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1, beta_mem}});
+   }
+   sp.SetMemory(node.Output(OUT_Y), gemm_dst_mem);
+ }
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc
+index 7d3d26bc972..1e21a955987 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc
+@@ -94,7 +94,7 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   src_mem = sp.GetMemoryAndReshape(node.Input(IN_INPUT), src_md, dnnl_engine);
+ 
+   // Make dst desc, must be same as src
+-  auto dst_md = dnnl::memory::desc(src_md.dims(), node.Output(OUT_OUTPUT).Type(), dnnl::memory::format_tag::any);
++  auto dst_md = dnnl::memory::desc(src_md.get_dims(), node.Output(OUT_OUTPUT).Type(), dnnl::memory::format_tag::any);
+ 
+   // Add src + skip
+   {
+@@ -105,8 +105,7 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     auto skip_mem = sp.GetMemoryAndReshape(node.Input(IN_SKIP), skip_md, dnnl_engine);
+ 
+     // Create and add primitive
+-    auto add_skip_d = dnnl::binary::desc(dnnl::algorithm::binary_add, src_md, skip_md, dst_md);
+-    auto add_skip_pd = dnnl::binary::primitive_desc(add_skip_d, dnnl_engine);
++    auto add_skip_pd = dnnl::binary::primitive_desc(dnnl_engine, dnnl::algorithm::binary_add, src_md, skip_md, dst_md);
+     auto add_skip = dnnl::binary(add_skip_pd);
+     std::unordered_map<int, dnnl::memory> add_skip_mem_map({{DNNL_ARG_SRC_0, src_mem}, {DNNL_ARG_SRC_1, skip_mem}, {DNNL_ARG_DST, src_mem}});
+     sp.AddPrimitive(add_skip, add_skip_mem_map);
+@@ -121,9 +120,9 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     // Move the bias to GPU if needed
+     auto bias_mem = sp.GetMemoryAndReshape(node.Input(IN_SLN_BIAS), bias_md, dnnl_engine);
+     // Get bias dims
+-    auto bias_dims = bias_md.dims();
++    auto bias_dims = bias_md.get_dims();
+     // Get src dims
+-    auto src_dims = src_md.dims();
++    auto src_dims = src_md.get_dims();
+ 
+     // To follow the spec means our bias will always have less dimensions that our input
+     // so we add the extra dimensions, reshape it and let OneDNN broadcast the value
+@@ -133,8 +132,7 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     bias_md = bias_md.reshape(bias_dims);
+ 
+     // Create and add primitive
+-    auto add_bias_d = dnnl::binary::desc(dnnl::algorithm::binary_add, src_md, bias_md, dst_md);
+-    auto add_bias_pd = dnnl::binary::primitive_desc(add_bias_d, dnnl_engine);
++    auto add_bias_pd = dnnl::binary::primitive_desc(dnnl_engine, dnnl::algorithm::binary_add, src_md, bias_md, dst_md);
+     auto add_bias = dnnl::binary(add_bias_pd);
+     std::unordered_map<int, dnnl::memory> add_bias_mem_map({{DNNL_ARG_SRC_0, src_mem}, {DNNL_ARG_SRC_1, bias_mem}, {DNNL_ARG_DST, src_mem}});
+     sp.AddPrimitive(add_bias, add_bias_mem_map);
+@@ -174,10 +172,8 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   // Get epsilon to avoid zero division
+   float epsilon = GetEpsilon(node);
+-  // Operation desciptor
+-  auto lnorm_desc = dnnl::layer_normalization_forward::desc(prop_kind, src_md, epsilon, op_flags);
+   // Primitive desciptor
+-  auto lnorm_pd = dnnl::layer_normalization_forward::primitive_desc(lnorm_desc, dnnl_engine);
++  auto lnorm_pd = dnnl::layer_normalization_forward::primitive_desc(dnnl_engine, prop_kind, src_md, src_md, epsilon, op_flags);
+   // Primitive
+   auto lnorm_prim = dnnl::layer_normalization_forward(lnorm_pd);
+ 
+@@ -190,8 +186,8 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   if (node.Input(scale_pos).Type() != dnnl::memory::data_type::f32) {
+     // casting to fp32 if input with other data type
+     auto gamma_md = gamma_mem.get_desc();
+-    auto dims = gamma_md.dims();
+-    auto strides = gamma_md.data.format_desc.blocking.strides;
++    auto dims = gamma_md.get_dims();
++    auto strides = gamma_md.get_strides();
+     dnnl::memory::dims gamma_strides_vec;
+     for (size_t i = 0; i < dims.size(); i++) {
+       gamma_strides_vec.push_back(strides[i]);
+@@ -210,8 +206,8 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   if (node.Input(shift_pos).Type() != dnnl::memory::data_type::f32) {
+     // casting to fp32 if input with other data type
+     auto beta_md = beta_mem.get_desc();
+-    auto dims = beta_md.dims();
+-    auto strides = beta_md.data.format_desc.blocking.strides;
++    auto dims = beta_md.get_dims();
++    auto strides = beta_md.get_strides();
+     dnnl::memory::dims beta_strides_vec;
+     for (size_t i = 0; i < dims.size(); i++) {
+       beta_strides_vec.push_back(strides[i]);
+@@ -249,7 +245,7 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+ void DnnlLayerNorm::ValidateDims(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   // Get input and evaluate
+-  auto input_dims = sp.GetMemory(node.Input(IN_INPUT)).get_desc().dims();
++  auto input_dims = sp.GetMemory(node.Input(IN_INPUT)).get_desc().get_dims();
+   auto input_dims_size = input_dims.size();
+ 
+   // Check the inputs are supported by OneDNN, this is mandatory since sometimes
+@@ -269,14 +265,14 @@ void DnnlLayerNorm::ValidateDims(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   }
+ 
+   // Get skip and evaluate
+-  auto skip_dims = sp.GetMemory(node.Input(IN_SKIP)).get_desc().dims();
++  auto skip_dims = sp.GetMemory(node.Input(IN_SKIP)).get_desc().get_dims();
+   if (input_dims != skip_dims) {
+     ORT_THROW("Input and skip dimmentions do not match");
+   }
+ 
+   // Check if bias was provided and evaluate
+   if (node.Input(IN_SLN_BIAS).Exists()) {
+-    auto bias_dims = sp.GetMemory(node.Input(IN_SLN_BIAS)).get_desc().dims();
++    auto bias_dims = sp.GetMemory(node.Input(IN_SLN_BIAS)).get_desc().get_dims();
+     if (bias_dims.size() != 1) {
+       ORT_THROW("Bias is expected to have 1 dimension, got ", bias_dims.size());
+     }
+@@ -297,7 +293,7 @@ void DnnlLayerNorm::ValidateDims(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   }
+ 
+   // Get gamma and evaluate
+-  auto gamma_dims = sp.GetMemory(node.Input(gamma_pos)).get_desc().dims();
++  auto gamma_dims = sp.GetMemory(node.Input(gamma_pos)).get_desc().get_dims();
+   if (gamma_dims.size() != 1) {
+     ORT_THROW("Gamma is expected to have 1 dimension, got ", gamma_dims.size());
+   }
+@@ -307,7 +303,7 @@ void DnnlLayerNorm::ValidateDims(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   // Check if shift was provided and evaluate
+   if (node.Input(shift_pos).Exists()) {
+-    auto beta_dims = sp.GetMemory(node.Input(shift_pos)).get_desc().dims();
++    auto beta_dims = sp.GetMemory(node.Input(shift_pos)).get_desc().get_dims();
+     if (beta_dims.size() != 1) {
+       ORT_THROW("Beta is expected to have 1 dimension, got ", beta_dims.size());
+     }
+@@ -334,7 +330,7 @@ dnnl::memory DnnlLayerNorm::CastAndTransformMemory(DnnlSubgraphPrimitive& sp, dn
+ 
+   // Make a new memory descriptor based on the source descriptor and given destination dataype and strides
+   auto src_md = src_mem.get_desc();
+-  dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.dims(), dst_datatype, dst_strides);
++  dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.get_dims(), dst_datatype, dst_strides);
+   dst_mem = dnnl::memory(dst_md, eng);
+ 
+   // Reorder source memory to destination memory as per the given dataype and strides
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_lrn.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_lrn.cc
+index c44e4772680..16f795127d7 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_lrn.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_lrn.cc
+@@ -24,17 +24,35 @@ void DnnlLrn::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto lrn_src_mem = sp.GetMemory(node.Input(IN_X));
+   auto lrn_src_md = lrn_src_mem.get_desc();
++  // Create a dst_md from src_md
++  auto dst_md = dnnl::memory::desc(lrn_src_md.get_dims(), lrn_src_md.get_data_type(), dnnl::memory::format_tag::any);
+ 
+-  auto lrn_desc = dnnl::lrn_forward::desc(dnnl::prop_kind::forward_scoring, dnnl::algorithm::lrn_across_channels, lrn_src_md, size, alpha, beta, bias);
+-  auto lrn_pd = dnnl::lrn_forward::primitive_desc(lrn_desc, dnnl_engine);
++  // Define prop kind according to training status
++  dnnl::prop_kind prop_kind;
++#ifdef ENABLE_TRAINING
++  prop_kind = dnnl::prop_kind::forward_training;
++#else
++  prop_kind = dnnl::prop_kind::forward_inference;
++#endif  // ENABLE_TRAINING
++
++  auto lrn_pd = dnnl::lrn_forward::primitive_desc(dnnl_engine, prop_kind, dnnl::algorithm::lrn_across_channels,
++                                                  lrn_src_md, dst_md, size, alpha, beta, bias);
+ 
+   // If using GPU this will move the memory from the CPU to the GPU.
+   lrn_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), lrn_pd.src_desc(), dnnl_engine);
+   auto lrn_dst_mem = dnnl::memory(lrn_pd.dst_desc(), dnnl_engine);
+ 
+   auto lrn_op = dnnl::lrn_forward(lrn_pd);
++#ifdef ENABLE_TRAINING
++  auto workspace_mem = dnnl::memory(lrn_pd.workspace_desc(), dnnl_engine);
++
++  sp.AddPrimitive(lrn_op, {{DNNL_ARG_SRC, lrn_src_mem},
++                           {DNNL_ARG_WORKSPACE, workspace_mem},
++                           {DNNL_ARG_DST, lrn_dst_mem}});
++#else
+   sp.AddPrimitive(lrn_op, {{DNNL_ARG_SRC, lrn_src_mem},
+-                           {DNNL_ARG_DST, lrn_dst_mem}});
++                           {DNNL_ARG_DST, lrn_dst_mem}});
++#endif  // ENABLE_TRAINING
+ 
+   sp.SetMemory(node.Output(OUT_Y), lrn_dst_mem);
+ }
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc
+index 49b7094559b..8ac0d37f88a 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc
+@@ -61,8 +61,8 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     alpha = GetAlpha(node);
+   }
+ 
+-  auto src_dims = sp.GetMemory(node.Input(IN_A)).get_desc().dims();
+-  auto weights_dims = sp.GetMemory(node.Input(IN_B)).get_desc().dims();
++  auto src_dims = sp.GetMemory(node.Input(IN_A)).get_desc().get_dims();
++  auto weights_dims = sp.GetMemory(node.Input(IN_B)).get_desc().get_dims();
+ 
+ 
+   // If this is required for transposed inputs, then this will be done later on in the code.
+@@ -190,7 +190,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+       // Handle Binary post ops including the input memory
+       if (binary_ops.count(post_ops[i]) != 0) {
+         auto ori_binary_md = sp.GetMemory(node.Input(IN_BINARY_0 + binary_count).Name()).get_desc();
+-        auto ori_binary_dims = ori_binary_md.dims();
++        auto ori_binary_dims = ori_binary_md.get_dims();
+         auto binary_mem_dims = ori_binary_dims;
+         if (ori_binary_dims.size() != output_shape.size()) {
+           if (ori_binary_dims.size() > output_shape.size()) {
+@@ -225,25 +225,29 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+             post_op_alpha = GetFloatAttr(node, "alpha", /*default_alpha*/ 1.0f);
+             break;
+           }
++          case dnnl::algorithm::eltwise_soft_relu: {
++            if (post_ops[i] == "Softplus") {
++              post_op_alpha = 1.0f;
++            }
++            break;
++          }
+           default:
+             post_op_alpha = 0.0;
+         }
+-        ops.append_eltwise(1.0f, algo, post_op_alpha, 0.0f);
++        ops.append_eltwise(algo, post_op_alpha, 0.0f);
+       }
+     }
+     attr.set_post_ops(ops);
+   }
+ 
+   if (is_fusedmatmul) {
+-    // Set the scaling of output as a post op in the primitive attribute, taking the value from alpha attribute
+-    std::vector<float> alphaScale({alpha});
+-    attr.set_output_scales(0, alphaScale);
++    // Set the value to scale DNNL_ARG_SRC with mask 0
++    attr.set_scales_mask(DNNL_ARG_SRC, 0);
+   }
+ 
+   auto dst_md = dnnl::memory::desc(output_shape, node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any);
+ 
+-  auto matmul_d = dnnl::matmul::desc(src_md, weights_md, dst_md);
+-  auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, attr, eng);
++  auto matmul_pd = dnnl::matmul::primitive_desc(eng, src_md, weights_md, dst_md, attr);
+ 
+   dnnl::memory matmul_src_mem, matmul_weights_mem;
+   auto matmul_dst_mem = dnnl::memory(matmul_pd.dst_desc(), eng);
+@@ -265,6 +269,15 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+                                  {DNNL_ARG_WEIGHTS, matmul_weights_mem},
+                                  {DNNL_ARG_DST, matmul_dst_mem}});
+ 
++  if (is_fusedmatmul) {
++    // Create the memory object related to the scale
++    auto alpha_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng);
++    // Write the alpha value into the memory object
++    sp.WriteToDnnlMemory<float>(alpha_mem, {alpha});
++    // Set alpha_mem to scale the output
++    mem_map.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, alpha_mem});
++  }
++
+   // add to memory map with extra third input if fused with add
+   if (has_postop_fusion) {
+     // add to memory map for extra binary inputs
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul_integer.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul_integer.cc
+index 7c92243f986..ffa146298e2 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul_integer.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul_integer.cc
+@@ -38,8 +38,8 @@ void DnnlMatMulInteger::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& nod
+     }
+   }
+ 
+-  auto src_dims = sp.GetMemory(node.Input(IN_A)).get_desc().dims();
+-  auto weights_dims = sp.GetMemory(node.Input(IN_B)).get_desc().dims();
++  auto src_dims = sp.GetMemory(node.Input(IN_A)).get_desc().get_dims();
++  auto weights_dims = sp.GetMemory(node.Input(IN_B)).get_desc().get_dims();
+ 
+   if (src_dims.size() != weights_dims.size()) {
+     while (src_dims.size() < weights_dims.size()) {
+@@ -70,11 +70,11 @@ void DnnlMatMulInteger::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& nod
+   bool has_b_zero_point = node.Input(IN_B_ZERO_POINT).Name() != "";
+ 
+   if (has_a_zero_point) {
+-    matmul_attr.set_zero_points(DNNL_ARG_SRC, /* mask */ 0, {DNNL_RUNTIME_S32_VAL});
++    matmul_attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask */ 0);
+   }
+ 
+   if (has_b_zero_point) {
+-    matmul_attr.set_zero_points(DNNL_ARG_WEIGHTS, /* mask */ 0, {DNNL_RUNTIME_S32_VAL});
++    matmul_attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, /* mask */ 0);
+   }
+ 
+   /*
+@@ -94,7 +94,7 @@ void DnnlMatMulInteger::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& nod
+       // Handle Binary post ops including the input memory
+       if (binary_ops.count(post_ops[i]) != 0) {
+         auto ori_binary_md = sp.GetMemory(node.Input(IN_BINARY_0 + binary_count).Name()).get_desc();
+-        auto ori_binary_dims = ori_binary_md.dims();
++        auto ori_binary_dims = ori_binary_md.get_dims();
+         auto binary_mem_dims = ori_binary_dims;
+         if (ori_binary_dims.size() != output_shape.size()) {
+           if (ori_binary_dims.size() > output_shape.size()) {
+@@ -129,17 +129,22 @@ void DnnlMatMulInteger::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& nod
+             post_op_alpha = GetFloatAttr(node, "alpha", /*default_alpha*/ 1.0f);
+             break;
+           }
++          case dnnl::algorithm::eltwise_soft_relu: {
++            if (post_ops[i] == "Softplus") {
++              post_op_alpha = 1.0f;
++            }
++            break;
++          }
+           default:
+             post_op_alpha = 0.0;
+         }
+-        ops.append_eltwise(1.0f, algo, post_op_alpha, 0.0f);
++        ops.append_eltwise(algo, post_op_alpha, 0.0f);
+       }
+     }
+     matmul_attr.set_post_ops(ops);
+   }
+ 
+-  auto matmul_d = dnnl::matmul::desc(src_md, weights_md, dst_md);
+-  auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, matmul_attr, eng);
++  auto matmul_pd = dnnl::matmul::primitive_desc(eng, src_md, weights_md, dst_md, matmul_attr);
+ 
+   auto matmul_src_mem = sp.GetMemoryAndReshape(node.Input(IN_A), matmul_pd.src_desc(), eng);
+   auto matmul_weights_mem = sp.GetMemoryAndReshape(node.Input(IN_B), matmul_pd.weights_desc(), eng);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc
+index 341868a3c70..32b9c64a920 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc
+@@ -22,9 +22,9 @@ void DnnlPool::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto pool_src_mem = sp.GetMemory(node.Input(IN_X));
+ #endif  // ENABLE_TRAINING
+   auto src_md = pool_src_mem.get_desc();
+-  auto src_dims = pool_src_mem.get_desc().dims();
++  auto src_dims = pool_src_mem.get_desc().get_dims();
+ 
+-  #ifdef ENABLE_TRAINING
++#ifdef ENABLE_TRAINING
+   auto prop_kind = dnnl::prop_kind::forward;
+ #else
+   auto prop_kind = dnnl::prop_kind::forward_inference;
+@@ -43,20 +43,16 @@ void DnnlPool::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   auto strides = GetStrides(node, shape);
+ 
+   auto dst_mem_dims = InferOutputDims(node, src_dims, kernel_shape, strides);
+-  dnnl::memory::desc dst_md = dnnl::memory::desc(dst_mem_dims, node.Input(IN_X).Type(), dnnl::memory::format_tag::any);
++  dnnl::memory::desc dst_md = dnnl::memory::desc(dst_mem_dims, node.Input(OUT_Y).Type(), dnnl::memory::format_tag::any);
+ 
+   auto padding = InferPadding(node, src_dims, kernel_shape, strides);
+   auto padding_left = GetPaddingLeft(padding);
+   auto padding_right = GetPaddingRight(padding);
+ 
++  auto dilation = dnnl::memory::dims(kernel_shape.size(), 0);
+ 
+-
+-  auto pool_desc = dnnl::pooling_forward::desc(prop_kind, algo,
+-                                               src_md, dst_md,
+-                                               strides, kernel_shape,
+-                                               padding_left, padding_right);
+-
+-  auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, dnnl_engine);
++  auto pool_pd = dnnl::pooling_forward::primitive_desc(dnnl_engine, prop_kind, algo, src_md, dst_md, strides,
++                                                       kernel_shape, dilation, padding_left, padding_right);
+ 
+ #ifndef ENABLE_TRAINING
+   // If using GPU this will move the memory from the CPU to the GPU.
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc
+index 946d5a5543f..301de8ee3e1 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc
+@@ -59,7 +59,7 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ 
+   auto dy_mem = sp.GetMemory(node.Input(IN_DY));
+   auto dy_md = dy_mem.get_desc();
+-  auto dy_dims = dy_mem.get_desc().dims();
++  auto dy_dims = dy_mem.get_desc().get_dims();
+ 
+   dnnl::memory indices_mem;
+   dnnl::memory::desc indices_md;
+@@ -69,7 +69,7 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+   if (maxpoolgrad_optype) {
+     indices_mem = sp.GetMemory(node.Input(IN_INDICES));
+     indices_md = indices_mem.get_desc();
+-    indices_dims = indices_mem.get_desc().dims();
++    indices_dims = indices_mem.get_desc().get_dims();
+   }
+ 
+   auto dx_dims = node.Output(OUT_DX).Dim();
+@@ -92,15 +92,15 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     }
+   }
+ 
+-  dnnl::pooling_forward::desc pool_forward_desc(dnnl::prop_kind::forward, algo,
+-                                                fwd_dx_md, dy_md,
+-                                                strides, kernel_shape,
+-                                                padding_left, padding_right);
+-  dnnl::pooling_forward::primitive_desc pool_forward_pd(pool_forward_desc, dnnl_engine);
++  // Dilatation of 1
++  auto dilatation = dnnl::memory::dims(kernel_shape.size(), 1);
+ 
+-  dnnl::pooling_backward::desc pool_backword_desc(algo, dx_md, dy_md,
+-                                                  strides, kernel_shape, padding_left, padding_right);
+-  dnnl::pooling_backward::primitive_desc pool_backward_pd(pool_backword_desc, dnnl_engine, pool_forward_pd);
++
++  dnnl::pooling_forward::primitive_desc pool_forward_pd(dnnl_engine, dnnl::prop_kind::forward, algo, fwd_dx_md, dy_md,
++                                                        strides, kernel_shape, dilatation, padding_left, padding_right);
++
++  dnnl::pooling_backward::primitive_desc pool_backward_pd(dnnl_engine, algo, dx_md, dy_md, strides, kernel_shape,
++                                                          dilatation, padding_left, padding_right, pool_forward_pd);
+ 
+   dnnl::pooling_backward pool_backward_op(pool_backward_pd);
+ 
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_pow.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_pow.cc
+index 470f30e551f..ccc42ef6a77 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_pow.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_pow.cc
+@@ -44,9 +44,11 @@ void DnnlPow::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+     ORT_THROW("Pow exponent data type not supported");
+   }
+ 
++  auto dst_md = dnnl::memory::desc(src_md.get_dims(), src_md.get_data_type(), dnnl::memory::format_tag::any);
++
+   // DNNL eltwise_pow is defined as alpha*x^beta. We don't use alpha so it is hard coded to 1.0
+-  dnnl::eltwise_forward::desc elementwise_desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_pow, src_md, 1.0, beta);
+-  dnnl::eltwise_forward::primitive_desc elementwise_pd(elementwise_desc, dnnl_engine);
++  dnnl::eltwise_forward::primitive_desc elementwise_pd(dnnl_engine, dnnl::prop_kind::forward_inference,
++                                                       dnnl::algorithm::eltwise_pow, src_md, dst_md, 1.0, beta);
+ 
+   // If using GPU this will move the memory from the CPU to the GPU.
+   elementwise_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), elementwise_pd.src_desc(), dnnl_engine);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
+index 05eee228b73..ebec6f4e74c 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.cc
+@@ -2,6 +2,7 @@
+ // Licensed under the MIT License
+ 
+ #include "dnnl_qattention.h"
++#include "dnnl_util.h"
+ 
+ namespace onnxruntime {
+ namespace ort_dnnl {
+@@ -21,8 +22,7 @@ dnnl::memory DnnlQAttention::ComputeTotalScale(DnnlSubgraphPrimitive& sp, DnnlNo
+   auto src_0_md = input_scale_mem.get_desc().reshape({1});
+   auto src_1_md = weights_scale_mem.get_desc().reshape({1});
+   auto dst_md = src_1_md;
+-  auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_mul, src_0_md, src_1_md, dst_md);
+-  auto binary_pd = dnnl::binary::primitive_desc(binary_d, eng);
++  auto binary_pd = dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_mul, src_0_md, src_1_md, dst_md);
+ 
+   auto binary_src0_mem = sp.GetMemoryAndReshape(node.Input(INPUT_SCALE), binary_pd.src0_desc(), eng);
+   auto binary_src1_mem = sp.GetMemoryAndReshape(node.Input(WEIGHTS_SCALE), binary_pd.src1_desc(), eng);
+@@ -115,12 +115,12 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+   {
+     //set input zp
+     if (has_input_zero_point) {
+-      matmul_attr.set_zero_points(DNNL_ARG_SRC, 0, {DNNL_RUNTIME_S32_VAL});
++      matmul_attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
+     }
+ 
+     //set weight zp
+     if (has_weights_zero_point) {
+-      matmul_attr.set_zero_points(DNNL_ARG_WEIGHTS, 0, {DNNL_RUNTIME_S32_VAL});
++      matmul_attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, 0);
+     }
+   }
+ 
+@@ -131,18 +131,18 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+   auto input_md_ori = sp.GetMemory(node.Input(INPUT)).get_desc();
+   auto weights_md_ori = sp.GetMemory(node.Input(WEIGHTS)).get_desc();
+ 
+-  auto weights_dims = weights_md_ori.dims();
++  auto weights_dims = weights_md_ori.get_dims();
+   weights_dims.insert(weights_dims.begin(), 1);
+ 
+-  input_md = dnnl::memory::desc(input_md_ori.dims(), input_md_ori.data_type(), dnnl::memory::format_tag::any);
+-  weights_md = dnnl::memory::desc(weights_dims, weights_md_ori.data_type(), dnnl::memory::format_tag::any);
++  input_md = dnnl::memory::desc(input_md_ori.get_dims(), input_md_ori.get_data_type(), dnnl::memory::format_tag::any);
++  weights_md = dnnl::memory::desc(weights_dims, weights_md_ori.get_data_type(), dnnl::memory::format_tag::any);
+   }
+ 
+   dnnl::memory::desc QKV_md;
+   {
+     //the output of int8 matmul is always 3 dims and consists of Q,K,V values
+-    auto QKV_dims = input_md.dims();
+-    QKV_dims[2] = weights_md.dims()[2];
++    auto QKV_dims = input_md.get_dims();
++    QKV_dims[2] = weights_md.get_dims()[2];
+     //use format any for optimization
+     if (isBF16Acc) {
+       QKV_md = dnnl::memory::desc(QKV_dims, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::any);
+@@ -151,8 +151,7 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+     }
+   }
+ 
+-  auto matmul_d = dnnl::matmul::desc(input_md, weights_md, QKV_md);
+-  auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, matmul_attr, eng);
++  auto matmul_pd = dnnl::matmul::primitive_desc(eng, input_md, weights_md, QKV_md, matmul_attr);
+   // (input-input_zero_point)*(weight-weight_zero_point)
+   auto matmul_prim = dnnl::matmul(matmul_pd);
+ 
+@@ -189,19 +188,16 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+   auto total_scale_mem = ComputeTotalScale(sp, node);
+ 
+   auto bias_md = sp.GetMemory(node.Input(BIAS)).get_desc();
+-  bias_md = bias_md.reshape({1, 1, bias_md.dims()[0]});
++  bias_md = bias_md.reshape({1, 1, bias_md.get_dims()[0]});
+   auto QKV_desc = QKV_mem.get_desc();
+ 
+-  //always broadcast from bias to QKV
+-  auto binary_d = dnnl::binary::desc(dnnl::algorithm::binary_add, QKV_desc, bias_md, QKV_desc);
+-
+   dnnl::primitive_attr binary_attr;
+   //scale source 0, matmul output
+   if (total_scale_mem) {
+-    binary_attr.set_scales(DNNL_ARG_SRC_0, 0, {DNNL_RUNTIME_F32_VAL});
++    binary_attr.set_scales_mask(DNNL_ARG_SRC_0, 0);
+   }
+ 
+-  auto binary_pd = dnnl::binary::primitive_desc(binary_d, binary_attr, eng);
++  auto binary_pd = dnnl::binary::primitive_desc(eng, dnnl::algorithm::binary_add, QKV_desc, bias_md, QKV_desc, binary_attr);
+   auto binary_prim = dnnl::binary(binary_pd);
+ 
+   auto bias_mem = sp.GetMemoryAndReshape(node.Input(BIAS), binary_pd.src1_desc(), eng);
+@@ -211,7 +207,7 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+                                   {DNNL_ARG_DST, QKV_mem}});
+ 
+   if (total_scale_mem) {
+-    binary_mem_map[DNNL_ARG_ATTR_INPUT_SCALES | DNNL_ARG_SRC_0] = total_scale_mem;
++    binary_mem_map[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0] = total_scale_mem;
+   }
+ 
+   sp.AddPrimitive(binary_prim, binary_mem_map);
+@@ -219,10 +215,10 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node)
+ 
+   //parse some dim information for permute and reshape
+   //eg, 8,512,2034 = 8,512,(3,12,64)
+-  auto batch_size = QKV_mem.get_desc().dims()[0];
+-  auto sequence_length = QKV_mem.get_desc().dims()[1];
++  auto batch_size = QKV_mem.get_desc().get_dims()[0];
++  auto sequence_length = QKV_mem.get_desc().get_dims()[1];
+   auto num_heads = GetNumHeads(node);
+-  auto hidden_size = QKV_mem.get_desc().dims()[2] / 3;
++  auto hidden_size = QKV_mem.get_desc().get_dims()[2] / 3;
+   auto head_size = hidden_size / num_heads;
+ 
+   // Slice QKV into submemories
+@@ -257,12 +253,16 @@ void
DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + //need a reorder of data type from s32 to f32 to let mask to have the same data type as QK result + if (has_mask_index) { + auto mask_index_mem_desc = sp.GetMemory(node.Input(MASK_INDEX)).get_desc(); +- +- auto linear_d = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_linear, mask_index_mem_desc, 10000.0f, -10000.0f); +- auto linear_pd = dnnl::eltwise_forward::primitive_desc(linear_d, eng); ++ auto linear_dst_mem = dnnl::memory::desc( mask_index_mem_desc.get_dims(), ++ mask_index_mem_desc.get_data_type(), ++ dnnl::memory::format_tag::any); ++ auto linear_pd = dnnl::eltwise_forward::primitive_desc( eng, dnnl::prop_kind::forward_inference, ++ dnnl::algorithm::eltwise_linear, ++ mask_index_mem_desc, linear_dst_mem, ++ 10000.0f, -10000.0f); + + auto mask_index_ori_mem = sp.GetMemoryAndReshape(node.Input(MASK_INDEX), linear_pd.src_desc(), eng); +- assert(linear_pd.dst_desc().data_type() == dnnl::memory::data_type::s32); ++ assert(linear_pd.dst_desc().get_data_type() == dnnl::memory::data_type::s32); + auto mask_index_mem_unbroadcasted_src = dnnl::memory(linear_pd.dst_desc(), eng); + + auto linear_prim = dnnl::eltwise_forward(linear_pd); +@@ -272,8 +272,8 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + dnnl::memory mask_index_mem_unbroadcasted_dst; + { + auto mask_index_md_unbroadcasted = mask_index_mem_unbroadcasted_src.get_desc(); +- auto dims = mask_index_md_unbroadcasted.dims(); +- auto strides = mask_index_md_unbroadcasted.data.format_desc.blocking.strides; ++ auto dims = mask_index_md_unbroadcasted.get_dims(); ++ auto strides = mask_index_md_unbroadcasted.get_strides(); + dnnl::memory::dims strides_vec; + for (size_t i = 0; i < dims.size(); i++) { + strides_vec.push_back(strides[i]); +@@ -288,7 +288,7 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + + + //unsqueeze the mem for broadcasting +- auto mask_index_dims = mask_index_mem_unbroadcasted_dst.get_desc().dims(); ++ auto mask_index_dims = mask_index_mem_unbroadcasted_dst.get_desc().get_dims(); + //not symetric, simply broadcasting + //eg 8,512 -> 8,1,1,512 + //eg 8,1,1,512 -> 8,12,512,512 +@@ -297,9 +297,7 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + auto mask_index_broadcasted_md = mask_index_mem_unbroadcasted_dst.get_desc().reshape(mask_index_dims); + //set mask_index_mem + mask_index_mem = dnnl::memory(mask_index_broadcasted_md, eng, nullptr); +- dnnl::stream s(eng); +- mask_index_mem.set_data_handle(mask_index_mem_unbroadcasted_dst.get_data_handle(), s); +- s.wait(); ++ mask_index_mem.set_data_handle(mask_index_mem_unbroadcasted_dst.get_data_handle()); + } + + dnnl::memory QK_mem; +@@ -308,8 +306,8 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + { + dnnl::primitive_attr QK_attr; + { +- auto scales = std::vector<float>({float(1 / std::sqrt(head_size))}); +- QK_attr.set_output_scales(0, scales); ++ // Set output scales ++ QK_attr.set_scales_mask(DNNL_ARG_SRC, 0); + + if (mask_index_mem) { + dnnl::post_ops add_bias; +@@ -326,26 +324,32 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + QK_md = dnnl::memory::desc({batch_size, num_heads, sequence_length, sequence_length}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::any); + } + } +- auto QK_d = dnnl::matmul::desc(Q_md, K_md, QK_md); +- auto QK_pd = 
dnnl::matmul::primitive_desc(QK_d, QK_attr, eng); ++ auto QK_pd = dnnl::matmul::primitive_desc(eng, Q_md, K_md, QK_md, QK_attr); + auto QK_prim = dnnl::matmul(QK_pd); + ++ // Create the memory object related to the scale ++ auto out_scales_mem = dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, eng); ++ // Write the alpha value into the memory object ++ sp.WriteToDnnlMemory<float>(out_scales_mem, std::vector<float>({float(1 / std::sqrt(head_size))})); ++ + QK_mem = dnnl::memory(QK_pd.dst_desc(), eng); + { + //QKV_mem is used as both input and weight but since matmul is defined on submemory, computation will be applied to correct submemory + std::unordered_map<int, dnnl::memory> QK_mem_map({{DNNL_ARG_SRC, Q_mem}, + {DNNL_ARG_WEIGHTS, K_mem}, +- {DNNL_ARG_DST, QK_mem}}); ++ {DNNL_ARG_DST, QK_mem}, ++ {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, out_scales_mem}}); + if (mask_index_mem) { + QK_mem_map[DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1] = mask_index_mem; + } +- sp.AddPrimitive(QK_prim, QK_mem_map); ++ sp.AddPrimitive(QK_prim, QK_mem_map, {DNNL_ARG_DST}); + } + + //apply softmax in place to produce attention prob + { +- auto softmax_desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_inference, QK_mem.get_desc(), 3); +- auto softmax_pd = dnnl::softmax_forward::primitive_desc(softmax_desc, eng); ++ auto softmax_pd = dnnl::softmax_forward::primitive_desc(eng, dnnl::prop_kind::forward_inference, ++ dnnl::algorithm::softmax_accurate, ++ QK_mem.get_desc(), QK_mem.get_desc(), 3); + auto softmax_prim = dnnl::softmax_forward::primitive(softmax_pd); + + //QK = softmax(QK) in place +@@ -367,8 +371,7 @@ void DnnlQAttention::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + } + } + +- auto Prob_V_d = dnnl::matmul::desc(QK_mem.get_desc(), V_md, QAttention_dst_md); +- auto Prob_V_pd = dnnl::matmul::primitive_desc(Prob_V_d, eng); ++ auto Prob_V_pd = dnnl::matmul::primitive_desc(eng, QK_mem.get_desc(), V_md, QAttention_dst_md); + auto Prob_V_prim = dnnl::matmul(Prob_V_pd); + + QAttention_dst_mem = dnnl::memory(Prob_V_pd.dst_desc(), eng); +@@ -424,7 +427,7 @@ dnnl::memory DnnlQAttention::CopySubMemory(DnnlSubgraphPrimitive& sp, dnnl::memo + + // Make destination memory object from source descriptor given sub memory dimension and offset + auto src_md = src_mem.get_desc().submemory_desc(sub_mem_dims, sub_mem_offset); +- dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.dims(), src_md.data_type(), sp.GetDnnlFormat(src_md.dims().size())); ++ dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.get_dims(), src_md.get_data_type(), sp.GetDnnlFormat(src_md.get_dims().size())); + dst_mem = dnnl::memory(dst_md, eng); + + // Copy submemory from source to destination given dimensions and offset +@@ -446,7 +449,7 @@ dnnl::memory DnnlQAttention::CastMemory(DnnlSubgraphPrimitive& sp, dnnl::memory& + + // Make a new memory descriptor based on the source descriptor and given destination datatype + auto src_md = src_mem.get_desc(); +- dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.dims(), dst_datatype, sp.GetDnnlFormat(src_md.dims().size())); ++ dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.get_dims(), dst_datatype, sp.GetDnnlFormat(src_md.get_dims().size())); + dst_mem = dnnl::memory(dst_md, eng); + + // Reorder source memory to destination memory as per the given datatype +@@ -468,7 +471,7 @@ dnnl::memory DnnlQAttention::CastAndTransformMemory(DnnlSubgraphPrimitive& sp, d + + // Make a new memory descriptor based on the source descriptor and given destination dataype and 
strides + auto src_md = src_mem.get_desc(); +- dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.dims(), dst_datatype, dst_strides); ++ dnnl::memory::desc dst_md = dnnl::memory::desc(src_md.get_dims(), dst_datatype, dst_strides); + dst_mem = dnnl::memory(dst_md, eng); + + // Reorder source memory to destination memory as per the given dataype and strides +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h +index 82879047679..d1cea23fca2 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h +@@ -5,7 +5,6 @@ + #include <cmath> + #include "dnnl_subgraph.h" + #include "dnnl_subgraph_primitive.h" +-#include "dnnl_util.h" + + namespace onnxruntime { + namespace ort_dnnl { +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc +index 1b06724e26d..cd8901a5043 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc +@@ -68,7 +68,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + } else { + if (node.Input(IN_AXES).Exists()) { + auto axes_mem = sp.GetMemory(node.Input(IN_AXES)); +- dnnl::memory::dims axes_dims = axes_mem.get_desc().dims(); ++ dnnl::memory::dims axes_dims = axes_mem.get_desc().get_dims(); + int64_t* p_axes_data = (int64_t*)axes_mem.get_data_handle(); + axes = std::vector<int64_t>(p_axes_data, p_axes_data + axes_dims[0]); + } +@@ -93,7 +93,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + //We need to calculate output tensor shape + //First we initialize it with input shape and then we modify it based on the attribute values + //This is because the DNNL primitive functionality is determined by the input and output shapes. +- auto src_dims = src_md.dims(); ++ auto src_dims = src_md.get_dims(); + auto ndim = src_dims.size(); + + // convert negative axis values to the positive axis +@@ -120,13 +120,13 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + + auto dst_shape = TensorShape(src_dims.data(), ndim); + dnnl::memory::dims dst_dims_mkl(dst_shape.GetDims().begin(), dst_shape.GetDims().end()); +- auto dst_md = dnnl::memory::desc({dst_dims_mkl}, src_md.data_type(), dnnl::memory::format_tag::any); ++ auto dst_md = dnnl::memory::desc({dst_dims_mkl}, src_md.get_data_type(), dnnl::memory::format_tag::any); + + // Check to see if the destination shape and source shape are the same. 
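// ----------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch] Every hunk in
// 14267.diff applies the same oneDNN v2 -> v3 migration: the intermediate
// operation descriptors (dnnl::reduction::desc, dnnl::eltwise_forward::desc,
// dnnl::matmul::desc, ...) were removed in oneDNN 3.0, and each primitive_desc
// constructor now takes the engine first plus an explicit destination
// descriptor. Assuming src_md, dst_md, attr and engine are already built:
//
//   // oneDNN v2: two-step construction through an op descriptor
//   auto op_d = dnnl::reduction::desc(dnnl::algorithm::reduction_sum,
//                                     src_md, dst_md, 0.f, 0.f);
//   auto pd_v2 = dnnl::reduction::primitive_desc(op_d, attr, engine);
//
//   // oneDNN v3: one constructor, engine-first, attr last
//   auto pd_v3 = dnnl::reduction::primitive_desc(
//       engine, dnnl::algorithm::reduction_sum, src_md, dst_md, 0.f, 0.f, attr);
//
// The dnnl::memory::desc accessors were renamed at the same time
// (dims() -> get_dims(), data_type() -> get_data_type()), which accounts for
// the bulk of the mechanical changes in this file.
// ----------------------------------------------------------------------------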
+ bool src_and_dst_dims_equal = true; +- if (src_md.dims().size() == dst_md.dims().size()) { +- for (size_t i = 0; i < src_md.dims().size(); ++i) { +- if (src_md.dims()[i] != dst_md.dims()[i]) { ++ if (src_md.get_dims().size() == dst_md.get_dims().size()) { ++ for (size_t i = 0; i < src_md.get_dims().size(); ++i) { ++ if (src_md.get_dims()[i] != dst_md.get_dims()[i]) { + src_and_dst_dims_equal = false; + break; + } +@@ -164,22 +164,25 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + dnnl::primitive_attr dnnl_primitive_attr; + if ((reduce_op == ReduceLogSum || reduce_op == ReduceLogSumExp ) && !src_and_dst_dims_equal) { + dnnl::post_ops eltwise_post_op; +- eltwise_post_op.append_eltwise(1.0f, dnnl::algorithm::eltwise_log, 1.0f, 1.0f); ++ eltwise_post_op.append_eltwise(dnnl::algorithm::eltwise_log, 1.0f, 1.0f); + dnnl_primitive_attr.set_post_ops(eltwise_post_op); + } + + if (reduce_op == ReduceLogSumExp) { + if (!src_and_dst_dims_equal) { +- auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_exp, src_md); +- auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine); ++ auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference, ++ dnnl::algorithm::eltwise_exp, src_md, ++ dnnl::memory::desc(src_md.get_dims(), ++ src_md.get_data_type(), ++ dnnl::memory::format_tag::any)); + + auto elementwise_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine); + + auto elemenwise_primitive = dnnl::eltwise_forward(elementwise_pd); + sp.AddPrimitive(elemenwise_primitive, {{DNNL_ARG_SRC, src_mem}, + {DNNL_ARG_DST, elementwise_dst_mem}}); +- auto reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, 0.f, 0.f); +- auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_primitive_attr, dnnl_engine); ++ auto reduce_pd = dnnl::reduction::primitive_desc(dnnl_engine, algo, src_md, dst_md, 0.f, 0.f, ++ dnnl_primitive_attr); + + reduce_dst_mem = dnnl::memory(reduce_pd.dst_desc(), dnnl_engine); + +@@ -190,8 +193,11 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + reduce_dst_mem = src_mem; + } + } else if(reduce_op == ReduceSumSquare) { +- auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_square, src_md); +- auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine); ++ auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference, ++ dnnl::algorithm::eltwise_square, src_md, ++ dnnl::memory::desc(src_md.get_dims(), ++ src_md.get_data_type(), ++ dnnl::memory::format_tag::any)); + + auto elementwise_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine); + +@@ -199,8 +205,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + sp.AddPrimitive(elemenwise_primitive, {{DNNL_ARG_SRC, src_mem}, + {DNNL_ARG_DST, elementwise_dst_mem}}); + if (!src_and_dst_dims_equal) { +- auto reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, 0.f, 0.f); +- auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_engine); ++ auto reduce_pd = dnnl::reduction::primitive_desc(dnnl_engine, algo, src_md, dst_md, 0.f, 0.f); + + reduce_dst_mem = dnnl::memory(reduce_pd.dst_desc(), dnnl_engine); + +@@ -220,8 +225,8 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + p_val = 2.0f; + } + +- auto 
reduce_desc = dnnl::reduction::desc(algo, src_md, dst_md, p_val, 0.f); +- auto reduce_pd = dnnl::reduction::primitive_desc(reduce_desc, dnnl_primitive_attr, dnnl_engine); ++ auto reduce_pd = dnnl::reduction::primitive_desc(dnnl_engine, algo, src_md, dst_md, p_val, 0.f, ++ dnnl_primitive_attr); + + // If using GPU this will move the memory from the CPU to the GPU. + reduce_src_mem = sp.GetMemoryAndReshape(node.Input(IN_DATA), reduce_pd.src_desc(), dnnl_engine); +@@ -232,8 +237,11 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + {DNNL_ARG_DST, reduce_dst_mem}}); + } else { + if (reduce_op == ReduceLogSum) { +- auto elementwise_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_log, src_md); +- auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(elementwise_desc, dnnl_engine); ++ auto elementwise_pd = dnnl::eltwise_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_inference, ++ dnnl::algorithm::eltwise_log, src_md, ++ dnnl::memory::desc(src_md.get_dims(), ++ src_md.get_data_type(), ++ dnnl::memory::format_tag::any)); + + reduce_dst_mem = dnnl::memory(elementwise_pd.dst_desc(), dnnl_engine); + +@@ -274,7 +282,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + if ((j < axes.size() && axes[j] == static_cast<int64_t>(i) && src_dims[i] == 0) || + (axes.size() == 0 && src_dims[i] == 0)) { + if (!keepdims) { +- auto dims = src_md.dims(); ++ auto dims = src_md.get_dims(); + ORT_ENFORCE(keepdims, + "Can't reduce on dim with value of 0 if 'keepdims' is false. " + "Invalid output shape would be produced. input_shape:", +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_relugrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_relugrad.cc +index 62da9cb3d89..a542a7d67b8 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_relugrad.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_relugrad.cc +@@ -18,13 +18,20 @@ void DnnlReluGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + auto relu_bwd_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), src_mem.get_desc(), eng); + auto relu_bwd_diff_dst_mem = sp.GetMemoryAndReshape(node.Input(IN_dY), diff_dst_mem.get_desc(), eng); + +- //create hints on the fly +- auto hints_d = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward, dnnl::algorithm::eltwise_relu, relu_bwd_src_mem.get_desc(), 0.0, 0.0); +- auto hints_pd = dnnl::eltwise_forward::primitive_desc(hints_d, eng); +- +- auto relu_bwd_d = dnnl::eltwise_backward::desc(dnnl::algorithm::eltwise_relu, relu_bwd_diff_dst_mem.get_desc(), relu_bwd_src_mem.get_desc(), 0.0, 0.0); ++ // Generate the dst_md ++ auto dst_md = dnnl::memory::desc(src_mem.get_desc().get_dims(), ++ node.Output(OUT_dX).Type(), ++ dnnl::memory::format_tag::any); + +- auto relu_bwd_pd = dnnl::eltwise_backward::primitive_desc(relu_bwd_d, eng, hints_pd); ++ //create hints on the fly ++ auto hints_pd = dnnl::eltwise_forward::primitive_desc(eng, dnnl::prop_kind::forward, dnnl::algorithm::eltwise_relu, ++ relu_bwd_src_mem.get_desc(), dst_md, 0.0, 0.0); ++ ++ auto relu_bwd_pd = dnnl::eltwise_backward::primitive_desc(eng, dnnl::algorithm::eltwise_relu, ++ relu_bwd_diff_dst_mem.get_desc(), ++ relu_bwd_src_mem.get_desc(), ++ src_mem.get_desc(), ++ 0.0, 0.0, hints_pd); + + auto relu_bwd_diff_src_mem = dnnl::memory(relu_bwd_pd.diff_src_desc(), eng); + +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reshape.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reshape.cc +index 
16090e86cf3..1e4ca5ddd4e 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reshape.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reshape.cc +@@ -15,10 +15,10 @@ void DnnlReshape::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + + // the input shape assumes OrtFormat so we get the memory in OrtFormat. + auto data_mem = sp.GetMemoryInOrtFormat(node.Input(IN_DATA), dnnl_engine); +- dnnl::memory::dims data_dims = data_mem.get_desc().dims(); ++ dnnl::memory::dims data_dims = data_mem.get_desc().get_dims(); + + auto shape_mem = sp.GetMemory(node.Input(IN_SHAPE)); +- dnnl::memory::dims shape_dims = shape_mem.get_desc().dims(); ++ dnnl::memory::dims shape_dims = shape_mem.get_desc().get_dims(); + int64_t* shape_data = (int64_t*)shape_mem.get_data_handle(); + + // Reshape helper will take input data_dims shape and the reshape_shape and replace the -1 and 0s with the calculated +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmax.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmax.cc +index fbb0754a3fe..c44abd913e8 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmax.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmax.cc +@@ -23,11 +23,18 @@ void DnnlSoftmax::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + auto softmax_src_mem = sp.GetMemory(node.Input(IN_X)); + auto softmax_src_md = softmax_src_mem.get_desc(); + +- if (axis < 0) +- axis = softmax_src_md.dims().size() + axis; ++ if (axis < 0){ ++ axis = softmax_src_md.get_dims().size() + axis; ++ } ++ ++ // Generate the dst_md ++ auto dst_md = dnnl::memory::desc(softmax_src_md.get_dims(), ++ node.Output(OUT_Y).Type(), ++ dnnl::memory::format_tag::any); + +- auto softmax_desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, softmax_src_md, (int) axis); +- auto softmax_pd = dnnl::softmax_forward::primitive_desc(softmax_desc, dnnl_engine); ++ auto softmax_pd = dnnl::softmax_forward::primitive_desc(dnnl_engine, dnnl::prop_kind::forward_training, ++ dnnl::algorithm::softmax_accurate, softmax_src_md, dst_md, ++ static_cast<int>(axis)); + + // If using GPU this will move the memory from the CPU to the GPU. 
+ softmax_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), softmax_pd.src_desc(), dnnl_engine); +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmaxgrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmaxgrad.cc +index f033b665776..930d7fe843b 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmaxgrad.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_softmaxgrad.cc +@@ -18,18 +18,27 @@ void DnnlSoftmaxGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) + auto softmax_bwd_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), src_mem.get_desc(), eng); + auto softmax_bwd_diff_dst_mem = sp.GetMemoryAndReshape(node.Input(IN_dY), diff_dst_mem.get_desc(), eng); + +- auto axis = ReadAxis(node); ++ int axis; ++ { ++ auto axis64 = ReadAxis(node); ++ if (axis64 < 0) ++ axis64 = src_mem.get_desc().get_dims().size() + axis64; + +- if (axis < 0) +- axis = src_mem.get_desc().dims().size() + axis; ++ axis = static_cast<int>(axis64); ++ } + +- //create hints on the fly +- auto hints_d = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, softmax_bwd_src_mem.get_desc(), (int) axis); +- auto hints_pd = dnnl::softmax_forward::primitive_desc(hints_d, eng); ++ auto fws_dst_md = dnnl::memory::desc(diff_dst_mem.get_desc().get_dims(), ++ diff_dst_mem.get_desc().get_data_type(), ++ dnnl::memory::format_tag::any); + +- auto softmax_bwd_d = dnnl::softmax_backward::desc(softmax_bwd_diff_dst_mem.get_desc(), softmax_bwd_src_mem.get_desc(), (int) axis); ++ //create hints on the fly ++ auto hints_pd = dnnl::softmax_forward::primitive_desc(eng, dnnl::prop_kind::forward_training, ++ dnnl::algorithm::softmax_accurate, ++ softmax_bwd_src_mem.get_desc(), fws_dst_md, axis); + +- auto softmax_bwd_pd = dnnl::softmax_backward::primitive_desc(softmax_bwd_d, eng, hints_pd); ++ auto softmax_bwd_pd = dnnl::softmax_backward::primitive_desc(eng, dnnl::algorithm::softmax_accurate, ++ fws_dst_md, softmax_bwd_diff_dst_mem.get_desc(), ++ softmax_bwd_src_mem.get_desc(), axis, hints_pd); + + auto softmax_bwd_diff_src_mem = dnnl::memory(softmax_bwd_pd.diff_src_desc(), eng); + +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc +index f9c2fe9b6bf..024dbb1f779 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc +@@ -15,14 +15,14 @@ void DnnlSqueeze::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + + // the input shape assumes OrtFormat so we get the memory in OrtFormat. + auto data_mem = sp.GetMemoryInOrtFormat(node.Input(IN_DATA), dnnl_engine); +- dnnl::memory::dims data_dims = data_mem.get_desc().dims(); ++ dnnl::memory::dims data_dims = data_mem.get_desc().get_dims(); + + std::vector<int64_t> axes_data; + // ONNX Squeeze version 13+ the axes is an input tensor + // ONNX Squeeze before version 13 axes comes from an Attribute. 
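// ----------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch] The Squeeze,
// Unsqueeze and Reduce hunks all read the ONNX `axes` input with the same
// pattern: the int64 tensor backing the dnnl::memory is copied element-wise
// into a std::vector. Assuming axes_mem holds a 1-D s64 tensor:
//
//   dnnl::memory::dims axes_dims = axes_mem.get_desc().get_dims();
//   const int64_t* p_axes = static_cast<const int64_t*>(axes_mem.get_data_handle());
//   std::vector<int64_t> axes(p_axes, p_axes + axes_dims[0]);
//
// Only the descriptor accessor changed in v3 (dims() -> get_dims()); the raw
// data-handle access is the same in both versions.
// ----------------------------------------------------------------------------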
+ if (node.Input(IN_AXES).Exists()) { + auto axes_mem = sp.GetMemory(node.Input(IN_AXES)); +- dnnl::memory::dims axes_dims = axes_mem.get_desc().dims(); ++ dnnl::memory::dims axes_dims = axes_mem.get_desc().get_dims(); + int64_t* p_axes_data = (int64_t*)axes_mem.get_data_handle(); + axes_data = std::vector<int64_t>(p_axes_data, p_axes_data + axes_dims[0]); + } else { +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc +index 0854bb29e5b..ce747daf623 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc +@@ -78,8 +78,8 @@ inline bool Contains(const Map& map, const Key& key) { + #if DNNL_TENSOR_PRINT_MEMORY + void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) { + auto md = mem.get_desc(); +- auto dt = md.data_type(); +- auto dims = md.dims(); ++ auto dt = md.get_data_type(); ++ auto dims = md.get_dims(); + if (Product(dims) > DNNL_TENSOR_PRINT_MEMORY_MAX_TENSOR_ELEMENTS) { + printf("tensor too long ignore printing \n"); + return; +@@ -87,7 +87,7 @@ void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) { + dnnl::memory to_mem; + if (!IsMemoryInExpectedOrtFormat(md)|| mem.get_engine().get_kind() != dnnl::engine::kind::cpu) { + printf("\n print memory reorder started \n"); +- dnnl::memory::desc to_md = dnnl::memory::desc(md.dims(), md.data_type(), GetDnnlFormat(md.dims().size())); ++ dnnl::memory::desc to_md = dnnl::memory::desc(md.get_dims(), md.get_data_type(), GetDnnlFormat(md.get_dims().size())); + to_mem = dnnl::memory(to_md, GetCPUEngine()); + auto stream = dnnl::stream(mem.get_engine()); + dnnl::reorder(mem, to_mem).execute(stream, {{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, to_mem}}); +@@ -411,7 +411,7 @@ void DnnlSubgraphPrimitive::AddOutputs() { + auto dnnl_tensor_name = tensor->Name(); + auto engine = GetCPUEngine(); + auto output_mem_dnnl = GetMemory(dnnl_tensor_name); +- auto output_md = dnnl::memory::desc(output_mem_dnnl.get_desc().dims(), dnnl_data_type, GetDnnlFormat(output_mem_dnnl.get_desc().dims().size())); ++ auto output_md = dnnl::memory::desc(output_mem_dnnl.get_desc().get_dims(), dnnl_data_type, GetDnnlFormat(output_mem_dnnl.get_desc().get_dims().size())); + // if output already in correct memory format, just place it to outputs instead of reorder + bool copy_output = outputs_are_always_copied_.find(dnnl_tensor_name) != outputs_are_always_copied_.end(); + if (output_mem_dnnl.get_desc() == output_md && output_mem_dnnl.get_engine() == engine && !copy_output) { +@@ -557,9 +557,9 @@ dnnl::memory DnnlSubgraphPrimitive::GetMemoryAndReshape(const DnnlTensor& tensor + auto mem_to = dnnl::memory(mem_desc, eng); + + // if it is a reshape, ensure reorder is possible by making the same dims +- if (mem_from.get_desc().dims() != mem_to.get_desc().dims() || transpose) { +- auto mem_from_dims = mem_from.get_desc().dims(); +- auto mem_to_dims = mem_to.get_desc().dims(); ++ if (mem_from.get_desc().get_dims() != mem_to.get_desc().get_dims() || transpose) { ++ auto mem_from_dims = mem_from.get_desc().get_dims(); ++ auto mem_to_dims = mem_to.get_desc().get_dims(); + if (Product(mem_from_dims) != Product(mem_to_dims)) { + LOGS_DEFAULT(ERROR) << tensor.Name() << ", Dims From: " << mem_from_dims << ", To: " << mem_to_dims; + throw std::invalid_argument("not a valid reshape, inconsistent dim product"); +@@ -571,14 +571,12 @@ dnnl::memory 
DnnlSubgraphPrimitive::GetMemoryAndReshape(const DnnlTensor& tensor + //TODO: expand to arbitrary permutation or transpose on given 2 dims for higher dimensional tensors + mem_from_reshape_md = mem_from_reshape_md.permute_axes({1, 0}); + } +- mem_from_reshape_md = mem_from_reshape_md.reshape(mem_desc.dims()); ++ mem_from_reshape_md = mem_from_reshape_md.reshape(mem_desc.get_dims()); + auto mem_from_reshape = dnnl::memory(mem_from_reshape_md, mem_from.get_engine(), nullptr); + if (is_constant) { // if constant, do reshape now + LOGS_DEFAULT(INFO) << "reshaped now"; + //use the stream as a hint to make sure data handle gets set +- dnnl::stream s{eng}; +- mem_from_reshape.set_data_handle(mem_from.get_data_handle(),s); +- s.wait(); ++ mem_from_reshape.set_data_handle(mem_from.get_data_handle()); + } else { + AddReshape(mem_from, mem_from_reshape); + } +@@ -614,7 +612,7 @@ dnnl::memory DnnlSubgraphPrimitive::GetMemoryAndReshape(const DnnlTensor& tensor + dnnl::memory DnnlSubgraphPrimitive::GetMemoryInOrtFormat(const DnnlTensor& tensor, const dnnl::engine& eng) { + auto from_mem = GetMemory(tensor); + auto from_desc = from_mem.get_desc(); +- auto from_dims = from_desc.dims(); ++ auto from_dims = from_desc.get_dims(); + if (!IsMemoryInExpectedOrtFormat(from_desc)) { + dnnl::memory::desc to_md = dnnl::memory::desc(from_dims, tensor.Type(), GetDnnlFormat(from_dims.size())); + dnnl::memory to_mem = dnnl::memory(to_md, eng); +@@ -628,18 +626,18 @@ dnnl::memory DnnlSubgraphPrimitive::GetMemoryInOrtFormat(const DnnlTensor& tenso + } + + bool DnnlSubgraphPrimitive::IsMemoryInExpectedOrtFormat(const dnnl::memory::desc& desc) const { +- if (desc.data.format_kind != dnnl_blocked) { ++ if (desc.get_format_kind() != dnnl::memory::format_kind::blocked) { + return false; + } +- if (desc.data.format_desc.blocking.inner_nblks != 0) { ++ if (desc.get_inner_nblks() != 0) { + return false; + } +- auto strides = desc.data.format_desc.blocking.strides; ++ auto strides = desc.get_strides(); + // if a data format is dnnl_format::abcd... the stride will go from largest to smallest + // if for example we have a shape {2,3,4} we expect a stride of {12, 4, 1} if it were + // of dnnl_format::abc if instead the stride were {12, 1, 4} that would be dnnl_format::acb + // which does not match what is expected from Onnxruntime. 
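// ----------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch] oneDNN 3.0 also
// dropped the stream-taking overload of dnnl::memory::set_data_handle(), so
// the set-handle / stream-wait pairs in GetMemoryAndReshape() above and
// Predict() below collapse to a single call:
//
//   // oneDNN v2
//   dnnl::stream s{eng};
//   mem.set_data_handle(buffer, s);
//   s.wait();
//
//   // oneDNN v3: no stream argument, no implied synchronization
//   mem.set_data_handle(buffer);
// ----------------------------------------------------------------------------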
+- for (size_t i = 1; i < desc.dims().size(); ++i) {
++ for (size_t i = 1; i < desc.get_dims().size(); ++i) {
+ if (strides[i - 1] < strides[i]) {
+ return false;
+ }
+@@ -666,23 +664,20 @@ onnxruntime::common::Status DnnlSubgraphPrimitive::Predict(const std::unordered_
+
+ for (auto& input : inputs) {
+ if (Contains(inputs_, input.first)) {
+- inputs_.at(input.first).set_data_handle(input.second.buffer, stream);
+- stream.wait();
++ inputs_.at(input.first).set_data_handle(input.second.buffer);
+ }
+ }
+
+ for (auto& output : outputs) {
+ if (Contains(outputs_, output.first)) {
+- outputs_.at(output.first).set_data_handle(output.second.buffer, stream);
+- stream.wait();
++ outputs_.at(output.first).set_data_handle(output.second.buffer);
+ }
+ }
+
+ // reshapes (eg, unsqueeze)
+ // it is safe to set data handle because all external data handles have been set and onednn managed memory data handles will not change
+ for (auto& reshape_pair : reshapes_) {
+- reshape_pair.second.set_data_handle(reshape_pair.first.get_data_handle(),stream);
+- stream.wait();
++ reshape_pair.second.set_data_handle(reshape_pair.first.get_data_handle());
+ }
+
+
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
+index b8e9079d029..cf9c8514a2f 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
+@@ -76,6 +76,25 @@ class DnnlSubgraphPrimitive {
+ dnnl::memory GetMemoryInOrtFormat(const DnnlTensor& tensor, const dnnl::engine& eng);
+ bool IsMemoryInExpectedOrtFormat(const dnnl::memory::desc& desc) const;
+
++ template <typename T>
++ void WriteToDnnlMemory(dnnl::memory& mem, std::vector<T> values) {
++ if (mem.get_engine().get_kind() == dnnl::engine::kind::gpu) {
++ // Create a CPU memory
++ auto cpu_memory = dnnl::memory(mem.get_desc(), GetCPUEngine());
++ // Copy data from the vector into the CPU memory data handle
++ std::copy(values.begin(), values.end(), static_cast<T*>(cpu_memory.get_data_handle()));
++ // Use reorder to copy data from CPU to GPU
++ dnnl::stream s{mem.get_engine()};
++ // Execute the reorder that fills mem; until it completes, mem holds uninitialized data
++ dnnl::reorder(cpu_memory, mem).execute(s, cpu_memory, mem);
++ // wait for reorder to complete
++ s.wait();
++ } else {
++ // Copy data from the vector into the memory data handle
++ std::copy(values.begin(), values.end(), static_cast<T*>(mem.get_data_handle()));
++ }
++ }
++
+ private:
+ std::string shape_key_;
+
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_sum.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_sum.cc
+index 8832c2ea5b4..2d692f47c18 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_sum.cc
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_sum.cc
+@@ -25,10 +25,10 @@ void DnnlSum::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+ scales.push_back(1.0f);
+ }
+
+- auto dst_dims = srcs_pd[0].dims();
++ auto dst_dims = srcs_pd[0].get_dims();
+ auto dst_md = dnnl::memory::desc({dst_dims}, node.Input(IN_DATA_0).Type(), dnnl::memory::format_tag::any);
+
+- auto sum_pd = dnnl::sum::primitive_desc(dst_md, scales, srcs_pd, dnnl_engine);
++ auto sum_pd = dnnl::sum::primitive_desc(dnnl_engine, dst_md, scales, srcs_pd);
+
+ for (size_t i = 0; i < src_mems.size(); ++i) {
+ src_mems[i] = sp.GetMemoryAndReshape(node.Input(static_cast<int>(IN_DATA_0 + i)), sum_pd.src_desc(), dnnl_engine);
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc
b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc +index 2f161e4ebda..a6952ab5fa8 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc +@@ -31,7 +31,7 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + auto dnnl_engine = sp.GetEngine(); + + auto data_mem = sp.GetMemory(node.Input(IN_DATA)); +- auto data_dims = data_mem.get_desc().dims(); ++ auto data_dims = data_mem.get_desc().get_dims(); + auto ndata_dims = data_dims.size(); + + auto perm = GetPerm(node); +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc +index 9532686028a..88cd212101d 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc +@@ -22,7 +22,7 @@ void DnnlUnsqueeze::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + // To counter this data_dims is left empty if the input is from a scalar. + dnnl::memory::dims data_dims; + if (!data_is_scalar) { +- data_dims = data_mem.get_desc().dims(); ++ data_dims = data_mem.get_desc().get_dims(); + } + + std::vector<int64_t> axes_data; +@@ -30,7 +30,7 @@ void DnnlUnsqueeze::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { + // ONNX Unsqueeze before version 13 axes comes from an Attribute. + if (node.Input(IN_AXES).Exists()) { + auto axes_mem = sp.GetMemory(node.Input(IN_AXES)); +- dnnl::memory::dims axes_dims = axes_mem.get_desc().dims(); ++ dnnl::memory::dims axes_dims = axes_mem.get_desc().get_dims(); + int64_t* p_axes_data = (int64_t*)axes_mem.get_data_handle(); + axes_data = std::vector<int64_t>(p_axes_data, p_axes_data + axes_dims[0]); + } else { +diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.cc +index 0279f4f7430..db9329e8b1f 100644 +--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.cc ++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.cc +@@ -40,12 +40,14 @@ bool GetGPUInfo(GPUInfo gpu_info) { + gpuRuntimeFound = true; + // attempt to make a dnnl::matmul::desc. If we are able to successfully make a bf16 matmul::desc + // assume the GPU supports all BF16 operations. 
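// ----------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch] The bf16 check
// below is a runtime capability probe: try to build a minimal bf16 matmul
// primitive_desc and treat dnnl_unimplemented as "not supported". Reduced to
// its core (gpu_engine assumed):
//
//   bool bf16_supported = false;
//   auto md = dnnl::memory::desc({1, 1}, dnnl::memory::data_type::bf16,
//                                dnnl::memory::format_tag::ab);
//   try {
//     auto pd = dnnl::matmul::primitive_desc(gpu_engine, md, md, md);
//     bf16_supported = true;
//   } catch (const dnnl::error& e) {
//     if (e.status != dnnl_unimplemented) throw;
//   }
// ----------------------------------------------------------------------------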
++ dnnl::primitive_attr attr;
++ attr.set_scales_mask(DNNL_ARG_SRC, 0);
++ attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask */ 0);
+ auto src0_md = dnnl::memory::desc({1,1}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::ab);
+ auto src1_md = dnnl::memory::desc({1,1}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::ab);
+ auto dst_md = dnnl::memory::desc({1,1}, dnnl::memory::data_type::bf16, dnnl::memory::format_tag::ab);
+- auto matmul_d = dnnl::matmul::desc(src0_md, src1_md, dst_md);
+ try {
+- auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, gpu_engine);
++ auto matmul_pd = dnnl::matmul::primitive_desc(gpu_engine, src0_md, src1_md, dst_md, attr);
+ gpuBF16Supported = true;
+ } catch(const dnnl::error& e) {
+ if (e.status == dnnl_unimplemented) {
+diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.h
+index 3dbb913f0d5..c8a96597c65 100644
+--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.h
++++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_util.h
+@@ -10,8 +10,11 @@ namespace onnxruntime {
+ namespace ort_dnnl {
+ namespace dnnl_util {
+ bool IsGPURuntimeAvalible();
++
+ bool IsBF16Supported();
++
+ dnnl::algorithm OrtOperatorToDnnlAlgorithm(std::string op);
++
+ } // namespace dnnl_util
+ } // namespace ort_dnnl
+ } // namespace onnxruntime
diff --git a/15089.diff b/15089.diff
new file mode 100644
index 000000000000..30a3be0bcbeb
--- /dev/null
+++ b/15089.diff
@@ -0,0 +1,476 @@
+diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+index e7e989cf17..6f76036b8e 100644
+--- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
++++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+@@ -31,4 +31,10 @@ struct OrtTensorRTProviderOptionsV2 {
+ int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
+ int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true
+ int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true
++ int trt_build_heuristics_enable; // Build engine using heuristics to reduce build time. Default 0 = false, nonzero = true
++ int trt_sparsity_enable; // Control if sparsity can be used by TRT. Default 0 = false, 1 = true
++ int trt_builder_optimization_level; // Set the builder optimization level. WARNING: levels below 2 do not guarantee good engine performance, but greatly improve build time. Default 2, valid range [0-4]
++ int trt_auxiliary_streams; // Set maximum number of auxiliary streams per inference stream. Setting this value to 0 will lead to optimal memory usage. Default -1 = heuristics
++ const char* trt_tactic_sources; // Specify the tactics to be used by adding (+) or removing (-) tactics from the default
++ // tactic sources (default = all available tactics) e.g. "-CUDNN,+CUBLAS" available keys: "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS"
+ };
+diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+index ca815fd788..ef96bc0e6f 100644
+--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
++++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+@@ -119,6 +119,67 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<s
+ }
+ } // namespace
+
++std::vector<std::string> SplitToStringVec(std::string const& s, char separator) {
++ std::vector<std::string> splitted;
++
++ for (size_t start = 0; start < s.length();) {
++ size_t separatorIndex = s.find(separator, start);
++ if (separatorIndex == std::string::npos) {
++ separatorIndex = s.length();
++ }
++ splitted.emplace_back(s.substr(start, separatorIndex - start));
++ start = separatorIndex + 1;
++ }
++
++ return splitted;
++}
++
++nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) {
++ nvinfer1::TacticSources disabledTactics = 0;
++ nvinfer1::TacticSources enabledTactics = 0;
++ std::vector<std::string> tacticList = SplitToStringVec(tactic_string, ',');
++ for (auto& t : tacticList) {
++ bool enable{false};
++ if (t.front() == '+') {
++ enable = true;
++ } else if (t.front() != '-') {
++ LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic source must be prefixed with + or -, skipping: " << t;
++ }
++ t.erase(0, 1);
++
++ const auto toUpper = [](std::string& sourceName) {
++ std::transform(
++ sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); });
++ return sourceName;
++ };
++
++ nvinfer1::TacticSource source{};
++ t = toUpper(t);
++ if (t == "CUBLAS") {
++ source = nvinfer1::TacticSource::kCUBLAS;
++ } else if (t == "CUBLASLT" || t == "CUBLAS_LT") {
++ source = nvinfer1::TacticSource::kCUBLAS_LT;
++ } else if (t == "CUDNN") {
++ source = nvinfer1::TacticSource::kCUDNN;
++ } else if (t == "EDGE_MASK_CONVOLUTIONS") {
++ source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS;
++ } else if (t == "JIT_CONVOLUTIONS") {
++ source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS;
++ } else {
++ LOGS_DEFAULT(WARNING) << "[TensorRT EP] Tactic source was not found with name: " << t;
++ }
++
++ uint32_t sourceBit = 1U << static_cast<uint32_t>(source);
++
++ if (enable) {
++ enabledTactics |= sourceBit;
++ } else {
++ disabledTactics |= sourceBit;
++ }
++ }
++ return enabledTactics & ~disabledTactics;
++}
++
+ namespace google {
+ namespace protobuf {
+ void ShutdownProtobufLibrary();
+@@ -324,6 +385,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
+ if (fp16_enable_) {
+ layer_norm_fp32_fallback_ = info.layer_norm_fp32_fallback;
+ }
++ build_heuristics_enable_ = info.build_heuristics_enable;
++ sparsity_enable_ = info.sparsity_enable;
++ builder_optimization_level_ = info.builder_optimization_level;
++ auxiliary_streams_ = info.auxiliary_streams;
++ tactic_sources_ = info.tactic_sources;
+ } else {
+ const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations);
+ if (!max_partition_iterations_env.empty()) {
+@@ -418,6 +484,31 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
+ if (!layer_norm_fp32_fallback_env.empty()) {
+ layer_norm_fp32_fallback_ = (std::stoi(layer_norm_fp32_fallback_env) == 0 ? false : true);
+ }
++
++ const std::string build_heuristics_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kBuildHeuristics);
++ if (!build_heuristics_env.empty()) {
++ build_heuristics_enable_ = (std::stoi(build_heuristics_env) == 0 ? false : true);
++ }
++
++ const std::string sparsity_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kSparsityEnable);
++ if (!sparsity_enable_env.empty()) {
++ sparsity_enable_ = (std::stoi(sparsity_enable_env) == 0 ? false : true);
++ }
++
++ const std::string builder_optimization_level_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kBuilderOptimizationLevel);
++ if (!builder_optimization_level_env.empty()) {
++ builder_optimization_level_ = std::stoi(builder_optimization_level_env);
++ }
++
++ const std::string auxiliary_streams_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kAuxiliaryStreams);
++ if (!auxiliary_streams_env.empty()) {
++ auxiliary_streams_ = std::stoi(auxiliary_streams_env);
++ }
++
++ const std::string tactic_sources_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTacticSources);
++ if (!tactic_sources_env.empty()) {
++ tactic_sources_ = tactic_sources_env;
++ }
+ }
+
+ // Validate setting
+@@ -483,7 +574,12 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
+ << ", trt_engine_decryption_lib_path: " << engine_decryption_lib_path_
+ << ", trt_force_sequential_engine_build: " << force_sequential_engine_build_
+ << ", trt_context_memory_sharing_enable: " << context_memory_sharing_enable_
+- << ", trt_layer_norm_fp32_fallback: " << layer_norm_fp32_fallback_;
++ << ", trt_layer_norm_fp32_fallback: " << layer_norm_fp32_fallback_
++ << ", trt_build_heuristics_enable: " << build_heuristics_enable_
++ << ", trt_sparsity_enable: " << sparsity_enable_
++ << ", trt_builder_optimization_level: " << builder_optimization_level_
++ << ", trt_auxiliary_streams: " << auxiliary_streams_
++ << ", trt_tactic_sources: " << tactic_sources_;
+ }
+
+ TensorrtExecutionProvider::~TensorrtExecutionProvider() {
+@@ -1366,6 +1462,38 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
+ }
+ }
+
++ // enable sparse weights
++ if (sparsity_enable_) {
++ trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed";
++ }
++
++ // enable builder heuristics
++ if (build_heuristics_enable_) {
++ trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled";
++ }
++
++ // switch optimization level
++ if (builder_optimization_level_ != 2) {
++ trt_config->setBuilderOptimizationLevel(builder_optimization_level_);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_;
++ }
++
++ // limit auxiliary streams
++ if (auxiliary_streams_ >= 0) {
++ trt_config->setMaxAuxStreams(auxiliary_streams_);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are set to " << auxiliary_streams_;
++ }
++
++ // limit used tactic sources
++ if (!tactic_sources_.empty()) {
++ nvinfer1::TacticSources tactics = trt_config->getTacticSources();
++ tactics |= GetTacticSourceFromString(tactic_sources_);
++ trt_config->setTacticSources(tactics);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using " << tactic_sources_;
++ }
++
+ // Build TRT engine here if the graph doesn't have dynamic shape input. Otherwise engine will
+ // be built at runtime
+ std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
+@@ -1498,13 +1626,19 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
+ NodeComputeInfo compute_info;
+ compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) {
+ std::unique_ptr<TensorrtFuncState> p = std::make_unique<TensorrtFuncState>();
++ // translate tactic sources string to nvinfer1::TacticSources
++ nvinfer1::TacticSources tactics = 0;
++ if (!tactic_sources_.empty()) {
++ tactics = GetTacticSourceFromString(tactic_sources_);
++ }
+ *p = {context->allocate_func, context->release_func, context->allocator_handle, &parsers_[context->node_name],
+ &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
+ &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
+ input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
+ dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
+ runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_,
+- dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
++ dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_,
++ build_heuristics_enable_, sparsity_enable_, builder_optimization_level_, auxiliary_streams_, !tactic_sources_.empty(), tactics};
+ *state = p.release();
+ return 0;
+ };
+@@ -1779,6 +1913,38 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
+ trt_config->setDLACore(trt_state->dla_core);
+ }
+
++ // enable sparse weights
++ if (trt_state->sparsity_enable) {
++ trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed";
++ }
++
++ // enable builder heuristics
++ if (trt_state->build_heuristics_enable) {
++ trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled";
++ }
++
++ // switch optimization level
++ if (trt_state->builder_optimization_level != 2) {
++ trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << trt_state->builder_optimization_level;
++ }
++
++ // limit auxiliary streams
++ if (trt_state->auxiliary_streams >= 0) {
++ trt_config->setMaxAuxStreams(trt_state->auxiliary_streams);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are set to " << trt_state->auxiliary_streams;
++ }
++
++ // limit used tactic sources
++ if (trt_state->filter_tactic_sources) {
++ nvinfer1::TacticSources tactics = trt_config->getTacticSources();
++ tactics |= trt_state->tactic_sources;
++ trt_config->setTacticSources(tactics);
++ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics;
++ }
++
+ // Build engine
+ {
+ auto lock = GetApiLock();
+diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+index 4558b75fee..042495e961 100644
+--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
++++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+@@ -30,6 +30,11 @@ static const std::string kDecryptionLibPath =
"ORT_TENSORRT_ENGINE_DECRYPTION_LI + static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD"; + static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE"; + static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK"; ++static const std::string kBuildHeuristics = "ORT_TENSORRT_BUILD_HEURISTICS_ENABLE"; ++static const std::string kSparsityEnable = "ORT_TENSORRT_SPARSITY_ENABLE"; ++static const std::string kBuilderOptimizationLevel = "ORT_TENSORRT_BUILDER_OPTIMIZATION_LEVEL"; ++static const std::string kAuxiliaryStreams = "ORT_TENSORRT_AUXILIARY_STREAMS"; ++static const std::string kTacticSources = "ORT_TENSORRT_TACTIC_SOURCES"; + // Old env variable for backward compatibility + static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; + } // namespace tensorrt_env_vars +@@ -114,6 +119,12 @@ struct TensorrtFuncState { + bool engine_decryption_enable = false; + int (*engine_decryption)(const char*, char*, size_t*) = nullptr; + int (*engine_encryption)(const char*, char*, size_t) = nullptr; ++ bool build_heuristics_enable = false; ++ bool sparsity_enable = false; ++ int builder_optimization_level = 2; ++ int auxiliary_streams = -1; ++ bool filter_tactic_sources = false; ++ nvinfer1::TacticSources tactic_sources; + }; + + // Logical device representation. +@@ -163,6 +174,11 @@ class TensorrtExecutionProvider : public IExecutionProvider { + bool int8_use_native_tensorrt_calibration_table_ = false; + bool dump_subgraphs_ = false; + bool engine_cache_enable_ = false; ++ bool build_heuristics_enable_ = false; ++ bool sparsity_enable_ = false; ++ int builder_optimization_level_ = 2; ++ int auxiliary_streams_ = -1; ++ std::string tactic_sources_; + std::string cache_path_, engine_decryption_lib_path_; + std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr; + OrtMutex tensorrt_mu_; +diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +index 2db405d512..b431cfc53b 100644 +--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc ++++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +@@ -30,6 +30,11 @@ constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine + // add new provider option name here. 
+ constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable"; + constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback"; ++constexpr const char* kBuildHeuristics = "trt_build_heuristics_enable"; ++constexpr const char* kSparsityEnable = "trt_sparsity_enable"; ++constexpr const char* kBuilderOptimizationLevel = "trt_builder_optimization_level"; ++constexpr const char* kAuxiliaryStreams = "trt_auxiliary_streams"; ++constexpr const char* kTacticSources = "trt_tactic_sources"; + } // namespace provider_option_names + } // namespace tensorrt + +@@ -66,6 +71,11 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions + .AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build) + .AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback) ++ .AddAssignmentToReference(tensorrt::provider_option_names::kBuildHeuristics, info.build_heuristics_enable) ++ .AddAssignmentToReference(tensorrt::provider_option_names::kSparsityEnable, info.sparsity_enable) ++ .AddAssignmentToReference(tensorrt::provider_option_names::kBuilderOptimizationLevel, info.builder_optimization_level) ++ .AddAssignmentToReference(tensorrt::provider_option_names::kAuxiliaryStreams, info.auxiliary_streams) ++ .AddAssignmentToReference(tensorrt::provider_option_names::kTacticSources, info.tactic_sources) + .Parse(options)); // add new provider option here. + + return info; +@@ -93,6 +103,11 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE + // add new provider option here. 
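// ----------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch] With the names
// registered above, the new settings travel through the ordinary string-based
// ProviderOptions path. A hypothetical caller-side example:
//
//   onnxruntime::ProviderOptions opts{
//       {"trt_builder_optimization_level", "3"},
//       {"trt_auxiliary_streams", "0"},
//       {"trt_tactic_sources", "-CUDNN,+CUBLAS"},
//   };
//   auto info = onnxruntime::TensorrtExecutionProviderInfo::FromProviderOptions(opts);
//
// FromProviderOptions() parses each string into the typed field it is bound to
// via AddAssignmentToReference(), and ToProviderOptions() performs the reverse
// conversion for the entries added below.
// ----------------------------------------------------------------------------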
+@@ -93,6 +103,11 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
+ // add new provider option here.
+ {tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)},
+ {tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)},
++ {tensorrt::provider_option_names::kBuildHeuristics, MakeStringWithClassicLocale(info.build_heuristics_enable)},
++ {tensorrt::provider_option_names::kSparsityEnable, MakeStringWithClassicLocale(info.sparsity_enable)},
++ {tensorrt::provider_option_names::kBuilderOptimizationLevel, MakeStringWithClassicLocale(info.builder_optimization_level)},
++ {tensorrt::provider_option_names::kAuxiliaryStreams, MakeStringWithClassicLocale(info.auxiliary_streams)},
++ {tensorrt::provider_option_names::kTacticSources, MakeStringWithClassicLocale(info.tactic_sources)},
+ };
+ return options;
+ }
+diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+index 1f1fdb679f..d0715756a2 100644
+--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
++++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+@@ -33,6 +33,11 @@ struct TensorrtExecutionProviderInfo {
+ bool force_sequential_engine_build{false};
+ bool context_memory_sharing_enable{false};
+ bool layer_norm_fp32_fallback{false};
++ bool build_heuristics_enable{false};
++ bool sparsity_enable{false};
++ int builder_optimization_level{2};
++ int auxiliary_streams{-1};
++ std::string tactic_sources{""};
+
+ static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
+ static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
+diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
+index dd6915878e..8e0f1e50c6 100644
+--- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
++++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
+@@ -70,6 +70,11 @@ struct Tensorrt_Provider : Provider {
+ info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0;
+ info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0;
+ info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0;
++ info.build_heuristics_enable = options.trt_build_heuristics_enable != 0;
++ info.sparsity_enable = options.trt_sparsity_enable != 0;
++ info.builder_optimization_level = options.trt_builder_optimization_level;
++ info.auxiliary_streams = options.trt_auxiliary_streams;
++ info.tactic_sources = options.trt_tactic_sources == nullptr ? "" : options.trt_tactic_sources;
+ return std::make_shared<TensorrtProviderFactory>(info);
+ }
+
+@@ -137,6 +142,24 @@ struct Tensorrt_Provider : Provider {
+ trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build;
+ trt_options.trt_context_memory_sharing_enable = internal_options.context_memory_sharing_enable;
+ trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback;
++ trt_options.trt_build_heuristics_enable = internal_options.build_heuristics_enable;
++ trt_options.trt_sparsity_enable = internal_options.sparsity_enable;
++ trt_options.trt_builder_optimization_level = internal_options.builder_optimization_level;
++ trt_options.trt_auxiliary_streams = internal_options.auxiliary_streams;
++ str_size = internal_options.tactic_sources.size();
++ if (str_size == 0) {
++ trt_options.trt_tactic_sources = nullptr;
++ } else {
++ dest = new char[str_size + 1];
++#ifdef _MSC_VER
++ strncpy_s(dest, str_size + 1, internal_options.tactic_sources.c_str(), str_size);
++#else
++ strncpy(dest, internal_options.tactic_sources.c_str(), str_size);
++#endif
++ dest[str_size] = '\0';
++ trt_options.trt_tactic_sources = (const char*)dest;
++ }
++
+ }
+
+ ProviderOptions GetProviderOptions(const void* provider_options) override {
+diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
+index 81510120f4..17545f2c06 100644
+--- a/onnxruntime/core/session/provider_bridge_ort.cc
++++ b/onnxruntime/core/session/provider_bridge_ort.cc
+@@ -1274,6 +1274,11 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
+ // Use default value as this field is not available in OrtTensorRTProviderOptionsV
+ trt_options_converted.trt_context_memory_sharing_enable = 0;
+ trt_options_converted.trt_layer_norm_fp32_fallback = 0;
++ trt_options_converted.trt_build_heuristics_enable = 0;
++ trt_options_converted.trt_sparsity_enable = 0;
++ trt_options_converted.trt_builder_optimization_level = 2;
++ trt_options_converted.trt_auxiliary_streams = -1;
++ trt_options_converted.trt_tactic_sources = "";
+ return trt_options_converted;
+ }
+
+diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
+index f61fe7b878..dc54198d55 100644
+--- a/onnxruntime/python/onnxruntime_pybind_state.cc
++++ b/onnxruntime/python/onnxruntime_pybind_state.cc
+@@ -366,7 +366,12 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
+ nullptr,
+ 0,
+ 0,
+- 0};
++ 0,
++ 0,
++ 0,
++ 2,
++ -1,
++ nullptr};
+ for (auto option : it->second) {
+ if (option.first == "device_id") {
+ if (!option.second.empty()) {
+diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
+index 5465f81270..172c83be60 100644
+--- a/onnxruntime/test/providers/cpu/model_tests.cc
++++ b/onnxruntime/test/providers/cpu/model_tests.cc
+@@ -701,7 +701,8 @@ TEST_P(ModelTest, Run) {
+ if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) {
+ OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30,
+ 1, // enable fp16
+- 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0};
++ 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0, 0, 0,
++ 2, -1, nullptr};
+ ortso.AppendExecutionProvider_TensorRT_V2(params);
+ } else {
+ OrtTensorRTProviderOptionsV2* ep_option = nullptr;
+diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+index f74ecd9213..2c358d0912 100644
+--- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
++++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+@@ -151,7 +151,12 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
+ nullptr,
+ 0,
+ 0,
+- 0};
++ 0,
++ 0,
++ 0,
++ 2,
++ -1,
++ nullptr};
+
+ params.trt_engine_cache_enable = 1;
+ std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+@@ -222,7 +227,12 @@
+ nullptr,
+ 0,
+ 0,
+- 0};
++ 0,
++ 0,
++ 0,
++ 2,
++ -1,
++ nullptr};
+
+ params.trt_engine_cache_enable = 1;
+ std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+@@ -386,7 +396,12 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
+ nullptr,
+ 0,
+ 0,
+- 0};
++ 0,
++ 0,
++ 0,
++ 2,
++ -1,
++ nullptr};
+
+ if (cache_type.compare("engine") == 0) {
+
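The same knobs surface as the trt_* provider options parsed in tensorrt_execution_provider_info.cc above. A sketch of passing them through the Python bindings this package ships (the PKGBUILD below shells out to python in the same style); the option keys are the ones the patch registers, while the model path and values are placeholders:

    # Sketch: hand the new provider options to a Python InferenceSession.
    python -c "import onnxruntime as ort; ort.InferenceSession('model.onnx', providers=[('TensorrtExecutionProvider', {'trt_build_heuristics_enable': True, 'trt_builder_optimization_level': 2, 'trt_auxiliary_streams': -1})])"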
diff --git a/PKGBUILD b/PKGBUILD
@@ -1,137 +1,336 @@
-# Maintainer: Chih-Hsuan Yen <yan12125@gmail.com>
+# Maintainer: Gustavo Alvarez <sl1pkn07@gmail.com>
+# Contributor: Chih-Hsuan Yen <yan12125@gmail.com>
+
+# MAKEFLAGS=-j2 # NOTE: can be useful when the build runs out of memory (OOM)
+
+_ENABLE_CUDA=1
+_ENABLE_TENSORRT=0 # NOTE: not working due to https://github.com/microsoft/onnxruntime/issues/15131
 pkgbase=python-onnxruntime
 # Not split DNNL EP to another package as it's needed unconditionally at runtime if built at compile time
 # https://github.com/microsoft/onnxruntime/blob/v1.9.1/onnxruntime/python/onnxruntime_pybind_state.cc#L533
-pkgname=(python-onnxruntime python-onnxruntime-cuda)
-pkgver=1.9.1
+pkgname=(
+ 'onnxruntime'
+ 'python-onnxruntime'
+)
+pkgver=1.16.3
 pkgdesc='Cross-platform, high performance scoring engine for ML models'
-pkgrel=4
-arch=(x86_64)
+pkgrel=1
+arch=('x86_64')
 url='https://github.com/microsoft/onnxruntime'
-license=(MIT)
-depends=(nsync re2 python-flatbuffers python-numpy python-onnx python-protobuf openmpi onednn)
-makedepends=(git cmake gtest gmock pybind11 python-setuptools nlohmann-json chrono-date boost eigen flatbuffers cuda cudnn nccl clang)
+license=('MIT')
+makedepends=(
+ 'git'
+ 'cmake'
+ 'ninja'
+ 'gcc-libs'
+ 'glibc'
+ 'cxxopts'
+ 'pybind11'
+ 'abseil-cpp'
+ 'nlohmann-json'
+ 'chrono-date'
+ 'boost'
+ 'eigen'
+# 'flatbuffers'
+ 'onednn'
+# 're2'
+# 'protobuf'
+ 'nsync'
+ 'openmpi'
+ 'python-coloredlogs'
+ 'python-flatbuffers'
+ 'python-numpy'
+# 'python-protobuf'
+ 'python-sympy'
+ 'python-setuptools'
+ 'python-installer'
+ 'python-wheel'
+ 'python-build'
+
+ 'chrpath'
+)
 # not de-vendored libraries
 # onnx: needs shared libonnx (https://github.com/onnx/onnx/issues/3030)
-source=("git+https://github.com/microsoft/onnxruntime#tag=v$pkgver"
- "git+https://github.com/onnx/onnx.git"
- "git+https://github.com/dcleblanc/SafeInt.git"
- "git+https://github.com/martinmoene/optional-lite.git"
- "git+https://github.com/tensorflow/tensorboard.git"
- "git+https://github.com/dmlc/dlpack.git"
- "git+https://github.com/jarro2783/cxxopts.git"
- "pytorch_cpuinfo::git+https://github.com/pytorch/cpuinfo.git"
- build-fixes.patch
- clang.patch
- system-dnnl.diff)
-sha512sums=('SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- 'SKIP'
- '685f0235abed6e1277dd0eb9bda56c464d1987fe7fc90a3550e17ec70cc49fd15f34996a0e159f9622c4ca3e6bf29917fe51b7849342531fa2a6808d782f1e06'
- 'ad94af8bb25744b244c4f82e9a06189741f82b295a88523ca0e8005568fac710c2299d783989457e9cf96ef8da0593fb4f70c8792d416f44ab29d6493e204f13'
- '6735c7aca2ba2f1f2a5286eb064125bf7f2c68a575d572dd157769d15778ff3e717b3a53d696c767748229f23ee6c3a7c82679df1d86283d7c4dd0ec9103ae08')
-# CUDA seems not working with LTO
-options+=('!lto')
+source=(
+ "git+https://github.com/microsoft/onnxruntime#tag=v${pkgver}"
+ 'install-orttraining-files.diff'
+ 'system-dnnl.diff'
+# 'system-flatbuffers.patch'
+)
+sha512sums=(
+ 'SKIP'
+ 'SKIP'
+ 'SKIP'
+# 'SKIP'
+)
+options=('debug')
+
+if [[ $_ENABLE_CUDA = 1 ]]; then
+ pkgname+=('onnxruntime-cuda')
+ makedepends+=(
+ 'cuda'
+ 'cudnn'
+ 'nccl'
+ )
+fi
+
+if [[ $_ENABLE_TENSORRT = 1 ]]; then
+ pkgname+=('onnxruntime-tensorrt')
+ makedepends+=('tensorrt')
+# depends+=('protobuf' 'libprotobuf.so')
+fi
 
 # Check PKGBUILDs of python-pytorch and tensorflow for CUDA architectures built by official packages
-_CUDA_ARCHITECTURES="52-real;53-real;60-real;61-real;62-real;70-real;72-real;75-real;80-real;86-real;86-virtual"
+_CUDA_ARCHITECTURES="52-real;53-real;60-real;61-real;62-real;70-real;72-real;75-real;80-real;86-real;87-real;89-real;90-real;90-virtual"
 
 prepare() {
  cd onnxruntime
- patch -Np1 -i ../build-fixes.patch
- patch -Np1 -i ../clang.patch
- patch -Np1 -i ../system-dnnl.diff
+ # Use system DNNL
+ patch -Np1 -i "${srcdir}/system-dnnl.diff"
+
+ # Find system nlohmann-json
+ sed 's|3.10 ||g' \
+ -i cmake/external/onnxruntime_external_deps.cmake
+
+ # Find system chrono-date
+ sed -e 's|${DEP_SHA1_date}|&\n \ \ \ \ \ \FIND_PACKAGE_ARGS NAMES date|g' \
+ -e 's|date_interface|date::date-tz|g' \
+ -i cmake/external/onnxruntime_external_deps.cmake \
+ -i cmake/onnxruntime_common.cmake \
+ -i cmake/onnxruntime_unittests.cmake
+
+ # Find system abseil-cpp
+ sed 's|ABSL_PATCH_COMMAND}|&\n\ \ \ \ \FIND_PACKAGE_ARGS NAMES absl|g' \
+ -i cmake/external/abseil-cpp.cmake
+
+ # Find system cxxopts
+ sed 's|${DEP_SHA1_cxxopts}|&\n\ \ \ \ \FIND_PACKAGE_ARGS NAMES cxxopts|g' \
+ -i cmake/external/onnxruntime_external_deps.cmake
+
+# # Find system mimalloc
+# sed 's|${DEP_SHA1_mimalloc}|&\n\ \ \ \ \ \ \FIND_PACKAGE_ARGS NAMES mimalloc|g' \
+# -i cmake/external/onnxruntime_external_deps.cmake
+
+ # Find system nsync
+ sed -e 's|NAMES nsync|&_cpp|g' \
+ -e '295aadd_library(nsync::nsync_cpp ALIAS nsync_cpp)' \
+ -i cmake/external/onnxruntime_external_deps.cmake
 
- git submodule init
- for mod in onnx SafeInt optional-lite tensorboard dlpack cxxopts pytorch_cpuinfo; do
- git config submodule.cmake/external/$mod.url "$srcdir"/$mod
- git submodule update cmake/external/$mod
- done
+ if [[ $_ENABLE_TENSORRT = 1 ]]; then
+ # Update Tensorboard 8343cad89d984c199637ead11c8d4c053191673a (2.15.1)
+# # Update Tensorboard a01ceb5957d9ecd56314df115c09e3ddb60d12f7
+# sed -e 's|373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81|a01ceb5957d9ecd56314df115c09e3ddb60d12f7|g' \
+# -e 's|ff427b6a135344d86b65fa2928fbd29886eefaec|113750f323d131859ac4e17070d2c9417e80d701|g' \
+# -i cmake/deps.txt
+
+
+ # Update onnx_tensorrt 6ba67d3428e05f690145373ca87fb8d32f98df45 (8.6 GA)
+ sed -e 's|0462dc31ae78f48744b6141ae376df1f96d3f459|6ba67d3428e05f690145373ca87fb8d32f98df45|g' \
+ -e 's|67b833913605a4f3f499894ab11528a702c2b381|67b833913605a4f3f499894ab11528a702c2b381|g' \
+ -i cmake/deps.txt
+ fi
+
+ patch -Np1 -i "${srcdir}/install-orttraining-files.diff"
+# patch -Np1 -i "${srcdir}/system-flatbuffers.patch"
+
+ # fix build with gcc12(?); takes ideas from https://github.com/microsoft/onnxruntime/pull/11667 and https://github.com/microsoft/onnxruntime/pull/10014
+ sed 's|dims)|TensorShape(dims))|g' \
+ -i onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq.cc
+
+ # fix missing #include <iostream>
+ sed '11a#include <iostream>' \
+ -i orttraining/orttraining/test/training_api/trainer/trainer.cc
+
+# cd onnxruntime/core/flatbuffers/schema
+# python compile_schema.py --flatc /usr/bin/flatc
 }
 
 build() {
- cd "$srcdir"/onnxruntime
- local cmake_args=(
+ if [[ ${_ENABLE_CUDA} = 1 ]]; then
+ export CC="/opt/cuda/bin/gcc"
+ export CXX="/opt/cuda/bin/g++"
+ export CUDAHOSTCXX="${CXX}"
+ fi
+
+ # Gcc 12+
+ CXXFLAGS+=" -Wno-maybe-uninitialized -Wno-error=restrict"
+ CFLAGS="${CFLAGS/_FORTIFY_SOURCE=2/_FORTIFY_SOURCE=0}"
+ CXXFLAGS="${CXXFLAGS/_FORTIFY_SOURCE=2/_FORTIFY_SOURCE=0}"
+
+
+ # Use -Donnxruntime_ENABLE_LAZY_TENSOR=OFF as it requires patched python-pytorch
+ # See: https://github.com/microsoft/onnxruntime/pull/10460 https://github.com/pytorch/pytorch/pulls/wschin
+ local _cmake_args=(
+ -DCMAKE_BUILD_TYPE=Debug
 -DCMAKE_INSTALL_PREFIX=/usr
+ -DCMAKE_SKIP_INSTALL_RPATH=OFF
+ -DCMAKE_SKIP_RPATH=OFF
 -Donnxruntime_ENABLE_PYTHON=ON
- -Donnxruntime_PREFER_SYSTEM_LIB=ON
 -Donnxruntime_BUILD_SHARED_LIB=ON
+ -Donnxruntime_BUILD_UNIT_TESTS=OFF
 -Donnxruntime_ENABLE_TRAINING=ON
+ -Donnxruntime_ENABLE_LAZY_TENSOR=OFF
 -Donnxruntime_USE_MPI=ON
- -Donnxruntime_USE_PREINSTALLED_EIGEN=ON
 -Donnxruntime_USE_DNNL=ON
- -Deigen_SOURCE_PATH=/usr/include/eigen3
+ -Donnxruntime_USE_PREINSTALLED_EIGEN=ON
+ -Deigen_SOURCE_PATH="$(pkg-config --cflags eigen3 | sed 's|-I||g')"
+ -DCMAKE_CXX_STANDARD=17
+ -DCMAKE_IGNORE_PATH=/usr/lib/cmake/flatbuffers/\;/lib/cmake/flatbuffers/\;/usr/lib/cmake/protobuf/\;/lib/cmake/protobuf/
+ -DBUILD_TESTING=OFF
 )
 
 # Use protobuf-lite instead of full protobuf to workaround symbol conflicts
 # with onnx; see https://github.com/onnx/onnx/issues/1277 for details.
- cmake_args+=(
- -DONNX_CUSTOM_PROTOC_EXECUTABLE=/usr/bin/protoc
+ _cmake_args+=(
 -Donnxruntime_USE_FULL_PROTOBUF=OFF
 )
 
- # 1. Redefine __is_signed to ___is_signed to workaround a regression
- # from CUDA 11.3 -> 11.3.1 [1].
- # 2. Enable parallel builds for NVCC via -t0, which spawns multiple
- # cicc and ptxas processes for each nvcc invocation. The number of
- # total processes may be much larger than the number of cores - let
- # the scheduler handle it.
- # [1] https://forums.developer.nvidia.com/t/182176
- cmake_args+=(
- -DCMAKE_CUDA_HOST_COMPILER=/usr/bin/clang
- -DCMAKE_CUDA_FLAGS="-D__is_signed=___is_signed -t0"
- -DCMAKE_CUDA_ARCHITECTURES="$_CUDA_ARCHITECTURES"
- -Donnxruntime_USE_CUDA=ON
- -Donnxruntime_CUDA_HOME=/opt/cuda
- -Donnxruntime_CUDNN_HOME=/usr
- -Donnxruntime_USE_NCCL=ON
- )
+ if [[ ${_ENABLE_CUDA} = 1 ]]; then
+ _cmake_args+=(
+ -DCMAKE_CUDA_ARCHITECTURES="${_CUDA_ARCHITECTURES}"
+ -DCMAKE_CUDA_STANDARD_REQUIRED=ON
+ -DCMAKE_CXX_STANDARD_REQUIRED=ON
+ -Donnxruntime_USE_CUDA=ON
+ -Donnxruntime_CUDA_HOME=/opt/cuda
+ -Donnxruntime_CUDNN_HOME=/usr
+ -Donnxruntime_USE_NCCL=ON
+ -Donnxruntime_NVCC_THREADS=1
+ )
+ fi
 
- # Use clang as GCC does not work. GCC 11 crashes with internal
- # compiler errors. GCC 10 does not work as some dependent packages
- # (ex: re2) are built with libstdc++ from GCC 11, and thus linking
- # onnxruntime with libstdc++ 10 fails.
- CC=/usr/bin/clang CXX=/usr/bin/clang++ \
- cmake -B build -S cmake "${cmake_args[@]}" "$@"
+ if [[ ${_ENABLE_TENSORRT} = 1 ]]; then
+ _cmake_args+=(
+ -Donnxruntime_USE_TENSORRT=ON
+ -Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=ON
+ )
+ fi
+ cmake -S onnxruntime/cmake -B build \
+ "${_cmake_args[@]}" \
+ "$@" \
+ -G Ninja
+
+ LC_ALL=C cmake --build build #-v
+
+ (
 cd build
- make
- python ../setup.py build
+ install -Dm644 ../onnxruntime/docs/python/README.rst docs/python/README.rst
+ ln -s ../onnxruntime/setup.py .
+ python -m build --wheel --no-isolation
+ )
 }
 
-package_python-onnxruntime() {
- cd onnxruntime/build
+package_onnxruntime() {
+ depends=(
+ 'gcc-libs' # libgcc_s.so libstdc++.so
+ 'glibc' # ld-linux-x86-64.so libc.so libm.so
+ 'onednn' # libdnnl.so
+ 'openmpi' 'libmpi.so'
+ 'abseil-cpp' # libabsl_hash.so libabsl_raw_hash_set.so libabsl_raw_logging_internal.so libabsl_throw_delegate.so
+ 'nsync' # libnsync_cpp.so
+# 'protobuf' 'libprotobuf-lite.so'
+ )
+ provides=(
+ 'libonnxruntime.so'
+ 'libonnxruntime_providers_shared.so'
+ )
 
- make install DESTDIR="$pkgdir"
+ DESTDIR="${pkgdir}" cmake --install build
 
- python ../setup.py install --root="$pkgdir" --skip-build --optimize=1
+ # installed as split packages
+ rm -vf "${pkgdir}/usr/lib/"libonnxruntime_providers_{tensorrt,cuda}.so
 
- PY_ORT_DIR="$(python -c 'import site; print(site.getsitepackages()[0])')/onnxruntime"
- install -Ddm755 "$pkgdir"/usr/share/licenses/$pkgname
- for f in LICENSE ThirdPartyNotices.txt ; do
- ln -s "$PY_ORT_DIR/$f" "$pkgdir"/usr/share/licenses/$pkgname/$f
- done
- # already installed by `make install`, and not useful as this path is not looked up by the linker
- rm -vf "$pkgdir/$PY_ORT_DIR"/capi/libonnxruntime_providers_*
+ chrpath -d "${pkgdir}/usr/lib/"libonnxruntime.so.*
 
- # installed as split packages
- rm -vf "$pkgdir"/usr/lib/libonnxruntime_providers_cuda.so
+ install -Dm644 onnxruntime/LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE"
+ install -Dm644 onnxruntime/ThirdPartyNotices.txt "${pkgdir}/usr/share/licenses/${pkgname}/ThirdPartyNotices.txt"
+}
+
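Between the build above and the packaging functions, an ordinary pacman/ldd check is a quick way to verify the split and the de-vendoring (a sketch; the exact package file name depends on pkgver/pkgrel and is assumed here):

    # Sketch: provider libraries must be gone from the base package, and
    # libonnxruntime.so should link the system libraries, not vendored copies.
    pacman -Qlp onnxruntime-1.16.3-1-x86_64.pkg.tar.zst | grep usr/lib/
    ldd /usr/lib/libonnxruntime.so | grep -E 'dnnl|nsync|absl'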
"${pkgdir}/${_PY_ORT_DIR}"/capi/libonnxruntime_providers_* + chrpath -d "${pkgdir}/${_PY_ORT_DIR}/capi/onnxruntime_pybind11_state.so" + + install -Ddm755 "${pkgdir}/usr/share/licenses" + ln -s onnxruntime "${pkgdir}/usr/share/licenses/${pkgname}" } -package_python-onnxruntime-cuda() { - depends+=(cuda cudnn nccl python-onnxruntime) +package_onnxruntime-cuda() { pkgdesc+=' (CUDA execution provider)' + depends=( + 'gcc-libs' # libgcc_s.so libstdc++.so + 'glibc' # ld-linux-x86-64.so libc.so libm.so + 'cudnn' # libcudnn.so + 'nccl' # libnccl.so + 'openmpi' 'libmpi.so' + 'nsync' # libnsync_cpp.so + 'abseil-cpp' # libabsl_hash.so libabsl_raw_hash_set.so libabsl_raw_logging_internal.so libabsl_throw_delegate.so + 'cuda' 'libcublas.so' 'libcudart.so' # libcublasLt.so libcufft.so +# 'protobuf' 'libprotobuf-lite.so' + ) + conflicts=('python-onnxruntime-cuda') + replaces=('python-onnxruntime-cuda') + + install -Dm755 build/libonnxruntime_providers_cuda.so -t "${pkgdir}/usr/lib" + + install -Ddm755 "${pkgdir}/usr/share/licenses" + ln -s onnxruntime "${pkgdir}/usr/share/licenses/${pkgname}" +} + +package_onnxruntime-tensorrt() { + pkgdesc+=' (TensorRT execution provider)' + depends=( + 'tensorrt' +# 'protobuf' 'libprotobuf-lite.so' + 'nsync' +# 'flatbuffers' + ) + pkgdesc+=' (TENSORRT execution provider)' + + install -Dm755 build/libonnxruntime_providers_tensorrt.so -t "${pkgdir}/usr/lib" - cd onnxruntime/build - install -Dm755 libonnxruntime_providers_cuda.so -t "$pkgdir"/usr/lib - install -Ddm755 "$pkgdir"/usr/share/licenses - ln -s python-onnxruntime "$pkgdir"/usr/share/licenses/$pkgname + install -Ddm755 "${pkgdir}/usr/share/licenses" + ln -s onnxruntime "${pkgdir}/usr/share/licenses/${pkgname}" } diff --git a/build-fixes.patch b/build-fixes.patch index 129aa988523e..a0b9b3c3e060 100644 --- a/build-fixes.patch +++ b/build-fixes.patch @@ -2,44 +2,35 @@ diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index a027c69e0..eb7608518 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt -@@ -753,7 +753,8 @@ get_filename_component(ORTTRAINING_ROOT "${ORTTRAINING_ROOT}" ABSOLUTE) - get_filename_component(REPO_ROOT "${REPO_ROOT}" ABSOLUTE) - set(ONNXRUNTIME_INCLUDE_DIR ${REPO_ROOT}/include/onnxruntime) +@@ -733,7 +733,7 @@ --add_subdirectory(external/date EXCLUDE_FROM_ALL) -+find_package(date REQUIRED) -+add_library(date_interface ALIAS date::date) - - set(SAFEINT_INCLUDE_DIR ${REPO_ROOT}/cmake/external/SafeInt) - add_library(safeint_interface INTERFACE) -@@ -764,14 +765,17 @@ if(onnxruntime_DISABLE_EXCEPTIONS) - add_compile_definitions(optional_CONFIG_NO_EXCEPTIONS=1) + if (NOT WIN32) + if (onnxruntime_PREFER_SYSTEM_LIB) +- find_package(nsync) ++ find_package(nsync_cpp) + endif() + if (TARGET nsync_cpp) # linking error with nsync_FOUND (why?) 
+ message("Use nsync from preinstalled system lib") +@@ -764,9 +765,11 @@ if(onnxruntime_DISABLE_EXCEPTIONS) endif() --add_subdirectory(external/mp11 EXCLUDE_FROM_ALL) -+add_library(boost_mp11 INTERFACE) -+add_library(Boost::mp11 ALIAS boost_mp11) - - set(JSON_BuildTests OFF CACHE INTERNAL "") - set(JSON_Install OFF CACHE INTERNAL "") --add_subdirectory(external/json EXCLUDE_FROM_ALL) -+find_package(nlohmann_json REQUIRED) - - if(onnxruntime_PREFER_SYSTEM_LIB) + if (onnxruntime_PREFER_SYSTEM_LIB) - find_package(re2) + find_package(PkgConfig) + pkg_check_modules(RE2 IMPORTED_TARGET re2) + add_library(re2::re2 ALIAS PkgConfig::RE2) endif() - if(NOT TARGET re2::re2) +-if (re2_FOUND) ++if (TARGET re2::re2) + message("Use re2 from preinstalled system lib") + else() add_subdirectory(external/re2 EXCLUDE_FROM_ALL) -@@ -1261,7 +1265,8 @@ set(FLATBUFFERS_INSTALL OFF CACHE BOOL "FLATBUFFERS_INSTALL" FORCE) - set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "FLATBUFFERS_BUILD_FLATHASH" FORCE) - set(FLATBUFFERS_BUILD_FLATLIB ON CACHE BOOL "FLATBUFFERS_BUILD_FLATLIB" FORCE) - set_msvc_c_cpp_compiler_warning_level(4) --add_subdirectory(external/flatbuffers EXCLUDE_FROM_ALL) -+find_package(Flatbuffers REQUIRED) -+add_library(flatbuffers ALIAS flatbuffers::flatbuffers_shared) - set_msvc_c_cpp_compiler_warning_level(3) - list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES flatbuffers) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES flatbuffers) +@@ -1421,7 +1421,7 @@ + endif() + if (Flatbuffers_FOUND) + message("Use flatbuffers from preinstalled system lib") +- add_library(flatbuffers ALIAS flatbuffers::flatbuffers) ++ add_library(flatbuffers ALIAS flatbuffers::flatbuffers_shared) + else() + message("Use flatbuffers from submodule") + # We do not need to build flatc for iOS or Android Cross Compile diff --git a/clang.patch b/clang.patch deleted file mode 100644 index ab6bb90fa315..000000000000 --- a/clang.patch +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc -index 955df6d9a..f9fd53e15 100644 ---- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc -+++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc -@@ -39,7 +39,7 @@ optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes( - } - - // normalize axis values and sort -- const std::vector<int64_t> axes = [&original_axes, rank]() { -+ const std::vector<int64_t> axes = [&original_axes, rank]() -> std::vector<int64_t> { - std::vector<int64_t> result(original_axes); - std::for_each( - result.begin(), result.end(), -@@ -85,7 +85,7 @@ optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes( - return std::distance(dims.begin(), before_min_axis_rit.base()); - }(); - -- const int64_t max_axis = [&dims, &axes, &is_dim_one]() { -+ const int64_t max_axis = [&dims, &axes, &is_dim_one]() -> int64_t { - const auto& max_given_axis = axes.back(); - const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1; - const auto after_max_axis_it = diff --git a/install-orttraining-files.diff b/install-orttraining-files.diff new file mode 100644 index 000000000000..8da1337458df --- /dev/null +++ b/install-orttraining-files.diff @@ -0,0 +1,21 @@ +diff --git a/setup.py b/setup.py +index 294b975a56..1612a4419c 100644 +--- a/setup.py ++++ b/setup.py +@@ -523,7 +523,7 @@ classifiers = [ + if not enable_training: + classifiers.extend(["Operating System :: Microsoft :: Windows", "Operating System :: MacOS"]) + 
+-if enable_training or enable_training_apis: ++if True: + packages.append("onnxruntime.training") + if enable_training: + packages.extend( +@@ -565,6 +565,7 @@ if enable_training or enable_training_apis: + ] + ) + ++if enable_training: + requirements_file = "requirements-training.txt" + # with training, we want to follow this naming convention: + # stable: diff --git a/notes.txt b/notes.txt deleted file mode 100644 index 62efe9cd463b..000000000000 --- a/notes.txt +++ /dev/null @@ -1,3 +0,0 @@ -Build system changes in 1.10 - Build: respect onnxruntime_PREFER_SYSTEM_LIB for more things https://github.com/microsoft/onnxruntime/pull/9181 - Remove optional-lite https://github.com/microsoft/onnxruntime/pull/9424 diff --git a/system-dnnl.diff b/system-dnnl.diff index 1444d8acaa1e..c9d5bbdcb44b 100644 --- a/system-dnnl.diff +++ b/system-dnnl.diff @@ -1,14 +1,15 @@ diff --git a/cmake/external/dnnl.cmake b/cmake/external/dnnl.cmake -index 6a51a3d5d..a89635210 100644 +index 175ad41b6f..8c3ba11491 100644 --- a/cmake/external/dnnl.cmake +++ b/cmake/external/dnnl.cmake -@@ -26,6 +26,13 @@ elseif(onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl" AND +@@ -26,6 +26,14 @@ elseif(onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl" AND endif() if (onnxruntime_USE_DNNL) -+if (onnxruntime_PREFER_SYSTEM_LIB) -+ # https://oneapi-src.github.io/oneDNN/dev_guide_transition_to_dnnl.html -+ find_package(dnnl CONFIG REQUIRED) ++# https://oneapi-src.github.io/oneDNN/dev_guide_transition_to_dnnl.html ++find_package(dnnl CONFIG) ++if (TARGET DNNL::dnnl) ++ message(STATUS "DNNL version: ${dnnl_VERSION}") + add_library(project_dnnl INTERFACE) + add_library(dnnl INTERFACE) + target_link_libraries(dnnl INTERFACE DNNL::dnnl) @@ -16,20 +17,33 @@ index 6a51a3d5d..a89635210 100644 set(DNNL_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/dnnl/src/dnnl/src) set(DNNL_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/dnnl/install) set(DNNL_LIB_DIR ${DNNL_INSTALL}/${CMAKE_INSTALL_LIBDIR}) -@@ -55,3 +62,4 @@ if (onnxruntime_USE_DNNL) +@@ -55,3 +66,4 @@ if (onnxruntime_USE_DNNL) ) link_directories(${DNNL_LIB_DIR}) endif() +endif() +diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake +index c24b6b9be5..11696cd761 100644 +--- a/cmake/onnxruntime_python.cmake ++++ b/cmake/onnxruntime_python.cmake +@@ -795,7 +795,7 @@ if (onnxruntime_ENABLE_TRAINING) + endif() + endif() + +-if (onnxruntime_USE_DNNL) ++if (onnxruntime_USE_DNNL AND NOT TARGET DNNL::dnnl) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake -index 6bdb2d03c..514faa375 100644 +index 00c017298c..1861e1a7ad 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake -@@ -744,10 +744,12 @@ add_custom_command( - if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) - if (onnxruntime_USE_DNNL) +@@ -867,10 +867,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) + target_compile_definitions(onnxruntime_test_all PUBLIC -DDNNL_GPU_RUNTIME=OCL) + endif() list(APPEND onnx_test_libs dnnl) -+ if (NOT onnxruntime_PREFER_SYSTEM_LIB) ++ if (NOT TARGET DNNL::dnnl) add_custom_command( TARGET ${test_data_target} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}> @@ -38,3 +52,4 @@ index 6bdb2d03c..514faa375 100644 endif() if(WIN32) if (onnxruntime_USE_TVM) + diff --git a/system-flatbuffers.patch b/system-flatbuffers.patch new file mode 100644 index 
000000000000..87b010f4ae47 --- /dev/null +++ b/system-flatbuffers.patch @@ -0,0 +1,14 @@ +diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake +index 0c41945778..39e3fc89e7 100644 +--- a/cmake/external/onnxruntime_external_deps.cmake ++++ b/cmake/external/onnxruntime_external_deps.cmake +@@ -289,6 +289,9 @@ namespace std { using ::getenv; } + target_compile_options(flatc PRIVATE /FI${CMAKE_BINARY_DIR}/gdk_cstdlib_wrapper.h) + endif() + endif() ++else() ++ add_executable(flatc ALIAS flatbuffers::flatc) ++ add_library(flatbuffers::flatbuffers ALIAS flatbuffers::flatbuffers_shared) + endif() + + if (onnxruntime_BUILD_UNIT_TESTS) |
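For reference, a typical build of this repository; makepkg usage here is standard Arch tooling rather than anything this repo mandates, and the MAKEFLAGS hint mirrors the OOM note at the top of the PKGBUILD:

    # Sketch: build with the default switches (_ENABLE_CUDA=1, _ENABLE_TENSORRT=0).
    # A lower job count can help when the CUDA compilation runs out of memory.
    MAKEFLAGS=-j2 makepkg --syncdeps
    # Setting _ENABLE_TENSORRT=1 additionally builds onnxruntime-tensorrt once
    # https://github.com/microsoft/onnxruntime/issues/15131 is resolved.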