-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[TensorRT] Fix perf issue for DDS nodes run by TRT 10 #23424
base: main
Are you sure you want to change the base?
Changes from 3 commits
f3d58e8
74f81f0
8785c3c
08e22bf
16f0e08
2b214ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -2481,17 +2481,16 @@ | |||||||||||||||||||||||
std::vector<size_t> nodes_vector(number_of_ort_nodes); | ||||||||||||||||||||||||
std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
std::set<std::string> exclude_ops_set; | ||||||||||||||||||||||||
std::set<std::string> exclude_ops_set; // currently there is no support for excluding ops | ||||||||||||||||||||||||
Check warning on line 2484 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc GitHub Actions / Optional Lint C++
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
/* | ||||||||||||||||||||||||
* There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. | ||||||||||||||||||||||||
* TRT EP automatically excludes DDS ops from running on TRT. | ||||||||||||||||||||||||
* There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10. | ||||||||||||||||||||||||
* DDS op needs special handling here. | ||||||||||||||||||||||||
*/ | ||||||||||||||||||||||||
if (trt_version_ >= 100000 && trt_version_ < 110000) { | ||||||||||||||||||||||||
exclude_ops_set.insert("NonMaxSuppression"); | ||||||||||||||||||||||||
exclude_ops_set.insert("NonZero"); | ||||||||||||||||||||||||
exclude_ops_set.insert("RoiAlign"); | ||||||||||||||||||||||||
LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable"; | ||||||||||||||||||||||||
if (trt_version_ >= 100000) { | ||||||||||||||||||||||||
dds_op_set_.insert("NonMaxSuppression"); | ||||||||||||||||||||||||
dds_op_set_.insert("NonZero"); | ||||||||||||||||||||||||
dds_op_set_.insert("RoiAlign"); | ||||||||||||||||||||||||
} | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
SubGraphCollection_t parser_nodes_vector, supported_nodes_vector; | ||||||||||||||||||||||||
|
@@ -2649,6 +2648,10 @@ | |||||||||||||||||||||||
} | ||||||||||||||||||||||||
LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider"; | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
#if NV_TENSORRT_MAJOR >= 10 | ||||||||||||||||||||||||
// TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that are run by TRT 10. | ||||||||||||||||||||||||
is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_); | ||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||
// The context map is only used during EP compile time, release it to save memory space. | ||||||||||||||||||||||||
subgraph_context_map_.clear(); | ||||||||||||||||||||||||
return result; | ||||||||||||||||||||||||
|
@@ -2665,6 +2668,11 @@ | |||||||||||||||||||||||
} | ||||||||||||||||||||||||
} | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
#if NV_TENSORRT_MAJOR >= 10 | ||||||||||||||||||||||||
// TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that are run by TRT 10. | ||||||||||||||||||||||||
is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_); | ||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
const size_t number_of_subgraphs = supported_nodes_vector.size(); | ||||||||||||||||||||||||
if (number_of_trt_nodes == 0) { | ||||||||||||||||||||||||
LOGS_DEFAULT(WARNING) << "[TensorRT EP] No graph will run on TensorRT execution provider"; | ||||||||||||||||||||||||
|
@@ -2765,6 +2773,18 @@ | |||||||||||||||||||||||
|
||||||||||||||||||||||||
common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs, | ||||||||||||||||||||||||
std::vector<NodeComputeInfo>& node_compute_funcs) { | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
#if NV_TENSORRT_MAJOR >= 10 | ||||||||||||||||||||||||
// There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10. | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
// The issue arises because when cudaStreamSynchronize is called after inference, GPU memory is released back to the OS. | ||||||||||||||||||||||||
// As a result, for the next inference run, TRT reallocates GPU memory from the OS, introducing overhead and leading to performance degradation. | ||||||||||||||||||||||||
// The solution is to increase the memory pool threshold, allowing TRT to retain the allocated memory and reduce this overhead. | ||||||||||||||||||||||||
if (is_dds_op_in_graph_) { | ||||||||||||||||||||||||
trt_gpu_allocator_ = std::make_unique<onnxruntime::PoolAllocator>(); | ||||||||||||||||||||||||
runtime_->setGpuAllocator(trt_gpu_allocator_.get()); | ||||||||||||||||||||||||
} | ||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
for (auto& fused_node_graph : fused_nodes_and_graphs) { | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; | ||||||||||||||||||||||||
const Node& fused_node = fused_node_graph.fused_node; | ||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -114,6 +114,50 @@ | |||||||||||||
using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>; | ||||||||||||||
}; // namespace tensorrt_ptr | ||||||||||||||
|
||||||||||||||
#if NV_TENSORRT_MAJOR >= 10
/*
 * Application-implemented class for controlling asynchronous (stream ordered) memory allocation on the GPU.
 *
 * The pool's release threshold is raised to the maximum so the driver retains reserved memory across
 * stream/event/context synchronization instead of returning it to the OS, avoiding reallocation
 * overhead on subsequent inference runs.
 */
class PoolAllocator : public nvinfer1::IGpuAsyncAllocator {
 public:
  PoolAllocator() {
    cudaMemPoolProps poolProps{};
    poolProps.allocType = ::cudaMemAllocationTypePinned;
    poolProps.handleTypes = ::cudaMemHandleTypeNone;
    poolProps.location.type = ::cudaMemLocationTypeDevice;
    // NOTE(review): assumes the pool lives on device 0 — confirm this matches the EP's device_id.
    poolProps.location.id = 0;
    cudaMemPoolCreate(&mPool, &poolProps);
    // Only configure the pool if creation actually succeeded; otherwise mPool stays nullptr
    // and allocateAsync() degrades to returning nullptr instead of touching an invalid pool.
    if (mPool != nullptr) {
      // cudaMemPoolAttrReleaseThreshold:
      // Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.
      // When more than the release threshold bytes of memory are held by the memory pool, the allocator
      // will try to release memory back to the OS on the next call to stream, event or context synchronize.
      auto maxThreshold = std::numeric_limits<std::uint64_t>::max();
      cudaMemPoolSetAttribute(mPool, cudaMemPoolAttrReleaseThreshold, &maxThreshold);
    }
  }

  // Owns a raw CUDA memory pool; copying would double-destroy it (Rule of Five).
  PoolAllocator(const PoolAllocator&) = delete;
  PoolAllocator& operator=(const PoolAllocator&) = delete;

  // Stream-ordered allocation from the pool. Returns nullptr on failure,
  // including the case where pool creation failed in the constructor.
  // alignment/flags are accepted per the IGpuAsyncAllocator interface but unused here.
  void* allocateAsync(uint64_t const size, uint64_t const alignment, nvinfer1::AllocatorFlags const flags,
                      cudaStream_t stream) noexcept override {
    (void)alignment;
    (void)flags;
    void* memory{nullptr};
    if (mPool != nullptr) {
      cudaMallocFromPoolAsync(&memory, size, mPool, stream);
    }
    return memory;
  }

  // Stream-ordered free; the memory is returned to the pool (retained per the release threshold above).
  bool deallocateAsync(void* const memory, cudaStream_t stream) noexcept override {
    cudaFreeAsync(memory, stream);
    return true;
  }

  ~PoolAllocator() override {
    if (mPool) {
      cudaMemPoolDestroy(mPool);
    }
  }

 private:
  cudaMemPool_t mPool{nullptr};  // owned CUDA memory pool; nullptr if creation failed
};
#endif
|
||||||||||||||
// | ||||||||||||||
// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is | ||||||||||||||
// not possible. | ||||||||||||||
|
@@ -312,6 +356,7 @@ | |||||||||||||
std::string tactic_sources_; | ||||||||||||||
std::string global_cache_path_, cache_path_, engine_decryption_lib_path_; | ||||||||||||||
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr; | ||||||||||||||
std::unique_ptr<PoolAllocator> trt_gpu_allocator_ = nullptr; | ||||||||||||||
std::mutex tensorrt_mu_; | ||||||||||||||
int device_id_; | ||||||||||||||
std::string compute_capability_; | ||||||||||||||
|
@@ -351,6 +396,9 @@ | |||||||||||||
std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"}; | ||||||||||||||
mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_; | ||||||||||||||
|
||||||||||||||
mutable std::unordered_set<std::string> dds_op_set_; | ||||||||||||||
mutable bool is_dds_op_in_graph_ = false; | ||||||||||||||
|
||||||||||||||
mutable std::unique_ptr<nvinfer1::IBuilder> builder_; | ||||||||||||||
|
||||||||||||||
// Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading. | ||||||||||||||
|
@@ -590,5 +638,12 @@ | |||||||||||||
* This function only creates the instance at the first time it's being called." | ||||||||||||||
*/ | ||||||||||||||
nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const; | ||||||||||||||
|
||||||||||||||
/** | ||||||||||||||
* Check if DDS op is in the ComputeCapability/subgraph. | ||||||||||||||
*/ | ||||||||||||||
bool IsDDSOpInSubGraph(const GraphViewer& graph, | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||
std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities, | ||||||||||||||
Check warning on line 646 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h GitHub Actions / Optional Lint C++
|
||||||||||||||
std::unordered_set<std::string>& dds_op_set) const; | ||||||||||||||
Check warning on line 647 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h GitHub Actions / Optional Lint C++
Check warning on line 647 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h GitHub Actions / Optional Lint C++
|
||||||||||||||
}; | ||||||||||||||
} // namespace onnxruntime |
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -258,4 +258,22 @@ | |||||||||||||||||
|
||||||||||||||||||
graph.SetInputs(graph_inputs_including_initializers); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
// Check if DDS op is in the ComputeCapability/subgraph. | ||||||||||||||||||
bool TensorrtExecutionProvider::IsDDSOpInSubGraph(const GraphViewer& graph, | ||||||||||||||||||
std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities, | ||||||||||||||||||
Check warning on line 264 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc GitHub Actions / Optional Lint C++
|
||||||||||||||||||
std::unordered_set<std::string>& dds_op_set) const { | ||||||||||||||||||
Check warning on line 265 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc GitHub Actions / Optional Lint C++
Check warning on line 265 in onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc GitHub Actions / Optional Lint C++
|
||||||||||||||||||
auto is_dds_op = [&](const auto& node) { | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
if (dds_op_set.find(node->OpType()) != dds_op_set.end()) return true; | ||||||||||||||||||
return false; | ||||||||||||||||||
}; | ||||||||||||||||||
|
||||||||||||||||||
for (auto& compute_capability : compute_capabilities) { | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
auto& indexed_sub_graph = compute_capability->SubGraph(); | ||||||||||||||||||
for (auto i : indexed_sub_graph->Nodes()) { | ||||||||||||||||||
if (is_dds_op(graph.GetNode(i))) return true; | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
return false; | ||||||||||||||||||
} | ||||||||||||||||||
} // namespace onnxruntime |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.