From 1c7cef008e59fe21c5813e84cad07227c82bff58 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 29 Aug 2019 11:53:10 -0700 Subject: [PATCH] Fix an issue where the CUDA EP falls back too many nodes to CPU in some cases, which causes huge data copies. If a node's inputs are all initializers, we shouldn't fall back the node to CPU. --- .../core/providers/cuda/cuda_execution_provider.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index c4de7e676c1bf..04a87a120bf0f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1018,6 +1018,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, // Note that nodes with only inputs from initializer would not be place on CUDA // Ideally, those nodes should be eliminated in constant folding bool should_force_outside = true; + bool all_input_are_initializer = true; node.ForEachWithIndex( node.InputDefs(), [&](const NodeArg& def, size_t index) { @@ -1025,12 +1026,17 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, // The input is not a initializer and the input is from CPU // or the input declared as CPU memory and is from CPU // in that case we should still keep the node on CUDA - if ((!graph.GetInitializedTensor(def.Name(), initializer) && !defs_outside_cuda.count(&def)) || + bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer); + if ((!initializer_input && !defs_outside_cuda.count(&def)) || (defs_outside_cuda.count(&def) && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) should_force_outside = false; + if (!initializer_input) { + all_input_are_initializer = false; + } return Status::OK(); }); - if (should_force_outside) { + // If all the inputs are initialier, we shouldn't force it to CPU + if (should_force_outside && 
!all_input_are_initializer) { force_outside = true; } }