diff --git a/MANIFEST.in b/MANIFEST.in index 16f9cc8938..9834de5cd7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ include requirements/runtime.txt include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json -include mmcv/ops/csrc/*.cuh mmcv/ops/csrc/*.hpp -include mmcv/ops/csrc/pytorch/*.cu mmcv/ops/csrc/pytorch/*.cpp -include mmcv/ops/csrc/parrots/*.cu mmcv/ops/csrc/parrots/*.cpp +include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp +include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu +include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp diff --git a/docs/compatibility.md b/docs/compatibility.md new file mode 100644 index 0000000000..d33ffdf1ff --- /dev/null +++ b/docs/compatibility.md @@ -0,0 +1,105 @@ +## Compatibility of MMCV + +### MMCV v1.3.11 + +In order to flexibly support more backends and hardwares like `NVIDIA GPUs` and `AMD GPUs`, the directory of `mmcv/ops/csrc` is refactored. Note that this refactoring will not affect the usage in API. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206). + +The original directory was organized as follows. + +``` +. +├── common_cuda_helper.hpp +├── ops_cuda_kernel.cuh +├── pytorch_cpp_helper.hpp +├── pytorch_cuda_helper.hpp +├── parrots_cpp_helper.hpp +├── parrots_cuda_helper.hpp +├── parrots_cudawarpfunction.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_cuda.cu +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +├── pytorch +│   ├── ... +│   ├── ops.cpp +│   ├── ops_cuda.cu +│   ├── pybind.cpp +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu +``` + +After refactored, it is organized as follows. + +``` +. +├── common +│ ├── box_iou_rotated_utils.hpp +│ ├── parrots_cpp_helper.hpp +│ ├── parrots_cuda_helper.hpp +│ ├── pytorch_cpp_helper.hpp +│ ├── pytorch_cuda_helper.hpp +│   └── cuda +│   ├── common_cuda_helper.hpp +│   ├── parrots_cudawarpfunction.cuh +│   ├── ... +│   └── ops_cuda_kernel.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +├── pytorch +│   ├── info.cpp +│   ├── pybind.cpp +│   ├── ... +│   ├── ops.cpp +│   └── cuda +│      ├── ... +│      └── ops_cuda.cu +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu +``` diff --git a/docs/index.rst b/docs/index.rst index 64e796f9b1..a751e451fa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,7 @@ You can switch between Chinese and English documents in the lower-left corner of deployment.rst understand_mmcv.rst api.rst + compatibility.md faq.md community.rst diff --git a/docs_zh_CN/compatibility.md b/docs_zh_CN/compatibility.md new file mode 100644 index 0000000000..ea564dd9fc --- /dev/null +++ b/docs_zh_CN/compatibility.md @@ -0,0 +1,105 @@ +## MMCV 兼容性说明 + +### MMCV v1.3.11 + +为了灵活地支持更多的后端和硬件,例如 `NVIDIA GPUs` 、`AMD GPUs`,我们重构了 `mmcv/ops/csrc` 目录。注意,这次重构不会影响 API 的使用。更多相关信息,请参考 [PR1206](https://github.com/open-mmlab/mmcv/pull/1206)。 + +原始的目录结构如下所示 + +``` +. +├── common_cuda_helper.hpp +├── ops_cuda_kernel.cuh +├── pytorch_cpp_helper.hpp +├── pytorch_cuda_helper.hpp +├── parrots_cpp_helper.hpp +├── parrots_cuda_helper.hpp +├── parrots_cudawarpfunction.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_cuda.cu +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +├── pytorch +│   ├── ... +│   ├── ops.cpp +│   ├── ops_cuda.cu +│   ├── pybind.cpp +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu +``` + +重构之后,它的结构如下所示 + +``` +. +├── common +│ ├── box_iou_rotated_utils.hpp +│ ├── parrots_cpp_helper.hpp +│ ├── parrots_cuda_helper.hpp +│ ├── pytorch_cpp_helper.hpp +│ ├── pytorch_cuda_helper.hpp +│   └── cuda +│   ├── common_cuda_helper.hpp +│   ├── parrots_cudawarpfunction.cuh +│   ├── ... +│   └── ops_cuda_kernel.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +├── pytorch +│   ├── info.cpp +│   ├── pybind.cpp +│   ├── ... +│   ├── ops.cpp +│   └── cuda +│      ├── ... +│      └── ops_cuda.cu +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu +``` diff --git a/docs_zh_CN/index.rst b/docs_zh_CN/index.rst index f4a26fe924..9fc00260ce 100644 --- a/docs_zh_CN/index.rst +++ b/docs_zh_CN/index.rst @@ -10,6 +10,7 @@ deployment.rst understand_mmcv.rst api.rst + compatibility.md faq.md community.rst diff --git a/mmcv/ops/csrc/README.md b/mmcv/ops/csrc/README.md new file mode 100644 index 0000000000..91c237f3d7 --- /dev/null +++ b/mmcv/ops/csrc/README.md @@ -0,0 +1,169 @@ +# Code Structure of CUDA operators + +This folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops. + +## Directories Tree + +```folder +. +├── common +│ ├── box_iou_rotated_utils.hpp +│ ├── parrots_cpp_helper.hpp +│ ├── parrots_cuda_helper.hpp +│ ├── pytorch_cpp_helper.hpp +│ ├── pytorch_cuda_helper.hpp +│   └── cuda +│   ├── common_cuda_helper.hpp +│   ├── parrots_cudawarpfunction.cuh +│   ├── ... +│   └── ops_cuda_kernel.cuh +├── onnxruntime +│   ├── onnxruntime_register.h +│   ├── onnxruntime_session_options_config_keys.h +│   ├── ort_mmcv_utils.h +│   ├── ... +│   ├── onnx_ops.h +│   └── cpu +│ ├── onnxruntime_register.cpp +│      ├── ... +│      └── onnx_ops_impl.cpp +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +├── pytorch +│   ├── info.cpp +│   ├── pybind.cpp +│   ├── ... +│   ├── ops.cpp +│   └── cuda +│      ├── ... +│      └── ops_cuda.cu +└── tensorrt + ├── trt_cuda_helper.cuh + ├── trt_plugin_helper.hpp + ├── trt_plugin.hpp + ├── trt_serialize.hpp + ├── ... + ├── trt_ops.hpp + └── plugins +    ├── trt_cuda_helper.cu +    ├── trt_plugin.cpp +    ├── ... +    ├── trt_ops.cpp +    └── trt_ops_kernel.cu +``` + +## Components + +- `common`: This directory contains all tools and shared codes. + - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. +- `onnxruntime`: **ONNX Runtime** support for custom ops. + - `cpu`: CPU implementation of supported ops. +- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. +- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. + - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. +- `tensorrt`: **TensorRT** support for custom ops. + - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`. + +## How to add new PyTorch ops? + +1. (Optional) Add shared kernel in `common` to support special hardware platform. + + ```c++ + // src/common/cuda/new_ops_cuda_kernel.cuh + + template + __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { + // forward here + } + + ``` + + Add cuda kernel launcher in `pytorch/cuda`. + + ```c++ + // src/pytorch/cuda + #include + + void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ + // initialize + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ... + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { + new_ops_forward_cuda_kernel + <<>>( + input.data_ptr(), output.data_ptr(),...); + })); + AT_CUDA_CHECK(cudaGetLastError()); + } + ``` + +2. Add ops implementation in `pytorch` directory. Select different implementations according to device type. + + ```c++ + // src/pytorch/new_ops.cpp + #ifdef MMCV_WITH_CUDA + Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ + // implement cuda forward here + // use `NewOpsForwardCUDAKernelLauncher` here + } + #else + + Tensor new_ops_forward_cpu(Tensor input, Tensor output, ...){ + // implement cpu forward here + } + + ... + + Tensor new_ops_forward(Tensor input, Tensor output, ...){ + // select implementation by input device type + if (boxes.device().is_cuda()) { + #ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(input); + CHECK_CUDA_INPUT(output); + return new_ops_forward_cuda(input, output, ...); + #else + AT_ERROR("new ops is not compiled with GPU support"); + #endif + } else { + CHECK_CPU_INPUT(input); + CHECK_CPU_INPUT(output); + return new_ops_forward_cpu(input, output, ...); + } + } + ``` + +3. Binding the implementation in `pytorch/pybind.cpp` + + ```c++ + // src/pytorch/pybind.cpp + + ... + + Tensor new_ops_forward(Tensor input, Tensor output, ...); + + ... + + // bind with pybind11 + m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", + py::arg("input"), py::arg("output"), ...); + + ... + + ``` + +4. Build MMCV again. Enjoy new ops in python + + ```python + from ..utils import ext_loader + ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) + + ... + + ext_module.new_ops_forward(input, output, ...) + + ``` diff --git a/mmcv/ops/csrc/box_iou_rotated_utils.hpp b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp similarity index 100% rename from mmcv/ops/csrc/box_iou_rotated_utils.hpp rename to mmcv/ops/csrc/common/box_iou_rotated_utils.hpp diff --git a/mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/border_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/border_align_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/box_iou_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh similarity index 100% rename from mmcv/ops/csrc/box_iou_rotated_cuda.cuh rename to mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh diff --git a/mmcv/ops/csrc/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/carafe_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/cc_attention_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/cc_attention_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/cc_attention_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/cc_attention_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/common_cuda_helper.hpp b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp similarity index 100% rename from mmcv/ops/csrc/common_cuda_helper.hpp rename to mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp diff --git a/mmcv/ops/csrc/deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/deform_conv_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/ms_deform_attn_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/nms_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/nms_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/nms_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh similarity index 100% rename from mmcv/ops/csrc/nms_rotated_cuda.cuh rename to mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh diff --git a/mmcv/ops/csrc/parrots_cudawarpfunction.cuh b/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh similarity index 100% rename from mmcv/ops/csrc/parrots_cudawarpfunction.cuh rename to mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh diff --git a/mmcv/ops/csrc/psamask_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/psamask_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/roi_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/roi_align_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/roi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/roi_align_rotated_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/roi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/roi_pool_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/sigmoid_focal_loss_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/sigmoid_focal_loss_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/softmax_focal_loss_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/softmax_focal_loss_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/sync_bn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/sync_bn_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/tin_shift_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh similarity index 100% rename from mmcv/ops/csrc/tin_shift_cuda_kernel.cuh rename to mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh diff --git a/mmcv/ops/csrc/parrots_cpp_helper.hpp b/mmcv/ops/csrc/common/parrots_cpp_helper.hpp similarity index 100% rename from mmcv/ops/csrc/parrots_cpp_helper.hpp rename to mmcv/ops/csrc/common/parrots_cpp_helper.hpp diff --git a/mmcv/ops/csrc/parrots_cuda_helper.hpp b/mmcv/ops/csrc/common/parrots_cuda_helper.hpp similarity index 100% rename from mmcv/ops/csrc/parrots_cuda_helper.hpp rename to mmcv/ops/csrc/common/parrots_cuda_helper.hpp diff --git a/mmcv/ops/csrc/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp similarity index 100% rename from mmcv/ops/csrc/pytorch_cpp_helper.hpp rename to mmcv/ops/csrc/common/pytorch_cpp_helper.hpp diff --git a/mmcv/ops/csrc/pytorch_cuda_helper.hpp b/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp similarity index 100% rename from mmcv/ops/csrc/pytorch_cuda_helper.hpp rename to mmcv/ops/csrc/common/pytorch_cuda_helper.hpp diff --git a/mmcv/ops/csrc/parrots/fused_bias_leakyrelu_cuda.cu b/mmcv/ops/csrc/parrots/fused_bias_leakyrelu_cuda.cu deleted file mode 100644 index 9598431545..0000000000 --- a/mmcv/ops/csrc/parrots/fused_bias_leakyrelu_cuda.cu +++ /dev/null @@ -1,109 +0,0 @@ -// from -// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu -// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. -// -// This work is made available under the Nvidia Source Code License-NC. -// To view a copy of this license, visit -// https://nvlabs.github.io/stylegan2/license.html - -#include -#include -#include -#include -#include -#include - -#include - -template -static __global__ void fused_bias_act_kernel( - scalar_t *out, const scalar_t *p_x, const scalar_t *p_b, - const scalar_t *p_ref, int act, int grad, scalar_t alpha, scalar_t scale, - int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) { - int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x; - - scalar_t zero = 0.0; - - for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; - loop_idx++, xi += blockDim.x) { - scalar_t x = p_x[xi]; - - if (use_bias) { - x += p_b[(xi / step_b) % size_b]; - } - - scalar_t ref = use_ref ? p_ref[xi] : zero; - - scalar_t y; - - // act = 1: linear layer - // act = 3: leaky relu layer - // grad = 0: direct forward path - // grad = 1: first order deviation - // grad = 2: second order deviation - switch (act * 10 + grad) { - default: - case 10: - y = x; - break; - case 11: - y = x; - break; - case 12: - y = 0.0; - break; - - case 30: - y = (x > 0.0) ? x : x * alpha; - break; - case 31: - y = (ref > 0.0) ? x : x * alpha; - break; - case 32: - y = 0.0; - break; - } - - out[xi] = y * scale; - } -} - -torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input, - const torch::Tensor &bias, - const torch::Tensor &refer, int act, - int grad, float alpha, float scale) { - int curDevice = -1; - cudaGetDevice(&curDevice); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - - auto x = input.contiguous(); - auto b = bias.contiguous(); - auto ref = refer.contiguous(); - - int use_bias = b.numel() ? 1 : 0; - int use_ref = ref.numel() ? 1 : 0; - - int size_x = x.numel(); - int size_b = b.numel(); - int step_b = 1; - - for (int i = 1 + 1; i < x.dim(); i++) { - step_b *= x.size(i); - } - - int loop_x = 4; - int block_size = 4 * 32; - int grid_size = (size_x - 1) / (loop_x * block_size) + 1; - - auto y = torch::empty_like(x); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - x.scalar_type(), "fused_bias_act_kernel", [&] { - fused_bias_act_kernel<<>>( - y.data_ptr(), x.data_ptr(), - b.data_ptr(), ref.data_ptr(), act, grad, alpha, - scale, loop_x, size_x, step_b, size_b, use_bias, use_ref); - }); - - return y; -} diff --git a/mmcv/ops/csrc/parrots/ms_deform_attn_cuda.cu b/mmcv/ops/csrc/parrots/ms_deform_attn_cuda.cu deleted file mode 100644 index 693131b382..0000000000 --- a/mmcv/ops/csrc/parrots/ms_deform_attn_cuda.cu +++ /dev/null @@ -1,360 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from -*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -#include -#include -#include -#include - -#include -#include -#include - -template -void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, const int spatial_size, - const int num_heads, const int channels, - const int num_levels, const int num_query, - const int num_point, scalar_t *data_col) { - const int num_kernels = batch_size * num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - const int num_threads = CUDA_NUM_THREADS; - ms_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_value, data_spatial_shapes, data_level_start_index, - data_sampling_loc, data_attn_weight, batch_size, spatial_size, - num_heads, channels, num_levels, num_query, num_point, data_col); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); - } -} - -template -void ms_deformable_col2im_cuda( - cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value, - const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, const int spatial_size, const int num_heads, - const int channels, const int num_levels, const int num_query, - const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) { - const int num_threads = - (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels; - const int num_kernels = batch_size * num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - if (channels > 1024) { - if ((channels & 1023) == 0) { - ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks - <<>>( - num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, spatial_size, num_heads, channels, num_levels, - num_query, num_point, grad_value, grad_sampling_loc, - grad_attn_weight); - } else { - ms_deformable_col2im_gpu_kernel_gm - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - } - } else { - switch (channels) { - case 1: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 2: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 4: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 8: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 16: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 32: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 64: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 128: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 256: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 512: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - case 1024: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, - data_attn_weight, batch_size, spatial_size, num_heads, - channels, num_levels, num_query, num_point, grad_value, - grad_sampling_loc, grad_attn_weight); - break; - default: - if (channels < 64) { - ms_deformable_col2im_gpu_kernel_shm_reduce_v1 - <<>>( - num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, spatial_size, num_heads, channels, num_levels, - num_query, num_point, grad_value, grad_sampling_loc, - grad_attn_weight); - } else { - ms_deformable_col2im_gpu_kernel_shm_reduce_v2 - <<>>( - num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, spatial_size, num_heads, channels, num_levels, - num_query, num_point, grad_value, grad_sampling_loc, - grad_attn_weight); - } - } - } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); - } -} - -at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) { - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), - "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), - "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), - "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), - "attn_weight tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), - "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), - "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), - "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", - batch, im2col_step_); - - auto output = - at::zeros({batch, num_query, num_heads, channels}, value.options()); - - const int batch_n = im2col_step_; - auto output_n = output.view( - {batch / im2col_step_, batch_n, num_query, num_heads, channels}); - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - for (int n = 0; n < batch / im2col_step_; ++n) { - auto columns = output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES( - value.type(), "ms_deform_attn_forward_cuda", ([&] { - ms_deformable_im2col_cuda( - at::cuda::getCurrentCUDAStream(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), level_start_index.data(), - sampling_loc.data() + - n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + - n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, - num_point, columns.data()); - })); - } - - output = output.view({batch, num_query, num_heads * channels}); - - return output; -} - -void ms_deform_attn_cuda_backward( - const at::Tensor &value, const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, const at::Tensor &grad_output, - at::Tensor &grad_value, at::Tensor &grad_sampling_loc, - at::Tensor &grad_attn_weight, const int im2col_step) { - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), - "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), - "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), - "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), - "attn_weight tensor has to be contiguous"); - AT_ASSERTM(grad_output.is_contiguous(), - "grad_output tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), - "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), - "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), - "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", - batch, im2col_step_); - - const int batch_n = im2col_step_; - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - auto grad_output_n = grad_output.view( - {batch / im2col_step_, batch_n, num_query, num_heads, channels}); - - for (int n = 0; n < batch / im2col_step_; ++n) { - auto grad_output_g = grad_output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES( - value.type(), "ms_deform_attn_backward_cuda", ([&] { - ms_deformable_col2im_cuda( - at::cuda::getCurrentCUDAStream(), grad_output_g.data(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), level_start_index.data(), - sampling_loc.data() + - n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + - n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, - num_point, - grad_value.data() + n * im2col_step_ * per_value_size, - grad_sampling_loc.data() + - n * im2col_step_ * per_sample_loc_size, - grad_attn_weight.data() + - n * im2col_step_ * per_attn_weight_size); - })); - } -} diff --git a/mmcv/ops/csrc/parrots/upfirdn2d_kernel.cu b/mmcv/ops/csrc/parrots/upfirdn2d_kernel.cu deleted file mode 100644 index 52a175bfd2..0000000000 --- a/mmcv/ops/csrc/parrots/upfirdn2d_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// from -// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d_kernel.cu -// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. -// -// This work is made available under the Nvidia Source Code License-NC. -// To view a copy of this license, visit -// https://nvlabs.github.io/stylegan2/license.html - -#include -#include -#include -#include -#include -#include - -#include - -static __host__ __device__ __forceinline__ int floor_div(int a, int b) { - int c = a / b; - - if (c * b > a) { - c--; - } - - return c; -} - -struct UpFirDn2DKernelParams { - int up_x; - int up_y; - int down_x; - int down_y; - int pad_x0; - int pad_x1; - int pad_y0; - int pad_y1; - - int major_dim; - int in_h; - int in_w; - int minor_dim; - int kernel_h; - int kernel_w; - int out_h; - int out_w; - int loop_major; - int loop_x; -}; - -template -__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input, - const scalar_t *kernel, - const UpFirDn2DKernelParams p) { - int minor_idx = blockIdx.x * blockDim.x + threadIdx.x; - int out_y = minor_idx / p.minor_dim; - minor_idx -= out_y * p.minor_dim; - int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y; - int major_idx_base = blockIdx.z * p.loop_major; - - if (out_x_base >= p.out_w || out_y >= p.out_h || - major_idx_base >= p.major_dim) { - return; - } - - int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0; - int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h); - int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y; - int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y; - - for (int loop_major = 0, major_idx = major_idx_base; - loop_major < p.loop_major && major_idx < p.major_dim; - loop_major++, major_idx++) { - for (int loop_x = 0, out_x = out_x_base; - loop_x < p.loop_x && out_x < p.out_w; loop_x++, out_x += blockDim.y) { - int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0; - int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w); - int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x; - int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x; - - const scalar_t *x_p = - &input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim + - minor_idx]; - const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x]; - int x_px = p.minor_dim; - int k_px = -p.up_x; - int x_py = p.in_w * p.minor_dim; - int k_py = -p.up_y * p.kernel_w; - - scalar_t v = 0.0f; - - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - v += static_cast(*x_p) * static_cast(*k_p); - x_p += x_px; - k_p += k_px; - } - - x_p += x_py - w * x_px; - k_p += k_py - w * k_px; - } - - out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + - minor_idx] = v; - } - } -} - -template -__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input, - const scalar_t *kernel, - const UpFirDn2DKernelParams p) { - const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1; - const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1; - - __shared__ volatile float sk[kernel_h][kernel_w]; - __shared__ volatile float sx[tile_in_h][tile_in_w]; - - int minor_idx = blockIdx.x; - int tile_out_y = minor_idx / p.minor_dim; - minor_idx -= tile_out_y * p.minor_dim; - tile_out_y *= tile_out_h; - int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w; - int major_idx_base = blockIdx.z * p.loop_major; - - if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h | - major_idx_base >= p.major_dim) { - return; - } - - for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w; - tap_idx += blockDim.x) { - int ky = tap_idx / kernel_w; - int kx = tap_idx - ky * kernel_w; - scalar_t v = 0.0; - - if (kx < p.kernel_w & ky < p.kernel_h) { - v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + (p.kernel_w - 1 - kx)]; - } - - sk[ky][kx] = v; - } - - for (int loop_major = 0, major_idx = major_idx_base; - loop_major < p.loop_major & major_idx < p.major_dim; - loop_major++, major_idx++) { - for (int loop_x = 0, tile_out_x = tile_out_x_base; - loop_x < p.loop_x & tile_out_x < p.out_w; - loop_x++, tile_out_x += tile_out_w) { - int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0; - int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0; - int tile_in_x = floor_div(tile_mid_x, up_x); - int tile_in_y = floor_div(tile_mid_y, up_y); - - __syncthreads(); - - for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w; - in_idx += blockDim.x) { - int rel_in_y = in_idx / tile_in_w; - int rel_in_x = in_idx - rel_in_y * tile_in_w; - int in_x = rel_in_x + tile_in_x; - int in_y = rel_in_y + tile_in_y; - - scalar_t v = 0.0; - - if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) { - v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * - p.minor_dim + - minor_idx]; - } - - sx[rel_in_y][rel_in_x] = v; - } - - __syncthreads(); - for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w; - out_idx += blockDim.x) { - int rel_out_y = out_idx / tile_out_w; - int rel_out_x = out_idx - rel_out_y * tile_out_w; - int out_x = rel_out_x + tile_out_x; - int out_y = rel_out_y + tile_out_y; - - int mid_x = tile_mid_x + rel_out_x * down_x; - int mid_y = tile_mid_y + rel_out_y * down_y; - int in_x = floor_div(mid_x, up_x); - int in_y = floor_div(mid_y, up_y); - int rel_in_x = in_x - tile_in_x; - int rel_in_y = in_y - tile_in_y; - int kernel_x = (in_x + 1) * up_x - mid_x - 1; - int kernel_y = (in_y + 1) * up_y - mid_y - 1; - - scalar_t v = 0.0; - -#pragma unroll - for (int y = 0; y < kernel_h / up_y; y++) -#pragma unroll - for (int x = 0; x < kernel_w / up_x; x++) - v += sx[rel_in_y + y][rel_in_x + x] * - sk[kernel_y + y * up_y][kernel_x + x * up_x]; - - if (out_x < p.out_w & out_y < p.out_h) { - out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + - minor_idx] = v; - } - } - } - } -} - -torch::Tensor upfirdn2d_op(const torch::Tensor &input, - const torch::Tensor &kernel, int up_x, int up_y, - int down_x, int down_y, int pad_x0, int pad_x1, - int pad_y0, int pad_y1) { - int curDevice = -1; - cudaGetDevice(&curDevice); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - - UpFirDn2DKernelParams p; - - auto x = input.contiguous(); - auto k = kernel.contiguous(); - - p.major_dim = x.size(0); - p.in_h = x.size(1); - p.in_w = x.size(2); - p.minor_dim = x.size(3); - p.kernel_h = k.size(0); - p.kernel_w = k.size(1); - p.up_x = up_x; - p.up_y = up_y; - p.down_x = down_x; - p.down_y = down_y; - p.pad_x0 = pad_x0; - p.pad_x1 = pad_x1; - p.pad_y0 = pad_y0; - p.pad_y1 = pad_y1; - - p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) / - p.down_y; - p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) / - p.down_x; - - auto out = - at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options()); - - int mode = -1; - - int tile_out_h = -1; - int tile_out_w = -1; - - if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && - p.kernel_h <= 4 && p.kernel_w <= 4) { - mode = 1; - tile_out_h = 16; - tile_out_w = 64; - } - - if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && - p.kernel_h <= 3 && p.kernel_w <= 3) { - mode = 2; - tile_out_h = 16; - tile_out_w = 64; - } - - if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && - p.kernel_h <= 4 && p.kernel_w <= 4) { - mode = 3; - tile_out_h = 16; - tile_out_w = 64; - } - - if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && - p.kernel_h <= 2 && p.kernel_w <= 2) { - mode = 4; - tile_out_h = 16; - tile_out_w = 64; - } - - if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && - p.kernel_h <= 4 && p.kernel_w <= 4) { - mode = 5; - tile_out_h = 8; - tile_out_w = 32; - } - - if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && - p.kernel_h <= 2 && p.kernel_w <= 2) { - mode = 6; - tile_out_h = 8; - tile_out_w = 32; - } - - dim3 block_size; - dim3 grid_size; - - if (tile_out_h > 0 && tile_out_w > 0) { - p.loop_major = (p.major_dim - 1) / 16384 + 1; - p.loop_x = 1; - block_size = dim3(32 * 8, 1, 1); - grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim, - (p.out_w - 1) / (p.loop_x * tile_out_w) + 1, - (p.major_dim - 1) / p.loop_major + 1); - } else { - p.loop_major = (p.major_dim - 1) / 16384 + 1; - p.loop_x = 4; - block_size = dim3(4, 32, 1); - grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1, - (p.out_w - 1) / (p.loop_x * block_size.y) + 1, - (p.major_dim - 1) / p.loop_major + 1); - } - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] { - switch (mode) { - case 1: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - case 2: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - case 3: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - case 4: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - case 5: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - case 6: - upfirdn2d_kernel - <<>>(out.data_ptr(), - x.data_ptr(), - k.data_ptr(), p); - - break; - - default: - upfirdn2d_kernel_large<<>>( - out.data_ptr(), x.data_ptr(), - k.data_ptr(), p); - } - }); - - return out; -} diff --git a/mmcv/ops/csrc/pytorch/box_iou_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/box_iou_rotated_cuda.cu deleted file mode 100644 index d399b5ce7f..0000000000 --- a/mmcv/ops/csrc/pytorch/box_iou_rotated_cuda.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -// modified from -// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu -#include "box_iou_rotated_cuda.cuh" -#include "pytorch_cuda_helper.hpp" - -void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious, - const int mode_flag, const bool aligned) { - using scalar_t = float; - AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor"); - AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor"); - - int output_size = ious.numel(); - int num_boxes1 = boxes1.size(0); - int num_boxes2 = boxes2.size(0); - - at::cuda::CUDAGuard device_guard(boxes1.device()); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - box_iou_rotated_cuda_kernel - <<>>( - num_boxes1, num_boxes2, boxes1.data_ptr(), - boxes2.data_ptr(), (scalar_t*)ious.data_ptr(), - mode_flag, aligned); - AT_CUDA_CHECK(cudaGetLastError()); -} diff --git a/mmcv/ops/csrc/parrots/bbox_overlaps_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/bbox_overlaps_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu diff --git a/mmcv/ops/csrc/parrots/border_align_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/border_align_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu diff --git a/mmcv/ops/csrc/parrots/box_iou_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/box_iou_rotated_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu diff --git a/mmcv/ops/csrc/parrots/carafe_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/carafe_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu diff --git a/mmcv/ops/csrc/parrots/carafe_naive_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/carafe_naive_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu diff --git a/mmcv/ops/csrc/parrots/cc_attention_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/cc_attention_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/cc_attention_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/cc_attention_cuda.cu diff --git a/mmcv/ops/csrc/pytorch/deform_conv_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu similarity index 100% rename from mmcv/ops/csrc/pytorch/deform_conv_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu diff --git a/mmcv/ops/csrc/parrots/deform_roi_pool_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/deform_roi_pool_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu diff --git a/mmcv/ops/csrc/parrots/focal_loss_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/focal_loss_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu diff --git a/mmcv/ops/csrc/pytorch/fused_bias_leakyrelu_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu similarity index 100% rename from mmcv/ops/csrc/pytorch/fused_bias_leakyrelu_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu diff --git a/mmcv/ops/csrc/parrots/masked_conv2d_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/masked_conv2d_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu diff --git a/mmcv/ops/csrc/parrots/modulated_deform_conv_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/modulated_deform_conv_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu diff --git a/mmcv/ops/csrc/pytorch/ms_deform_attn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu similarity index 99% rename from mmcv/ops/csrc/pytorch/ms_deform_attn_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu index 693131b382..07fe93cb4d 100644 --- a/mmcv/ops/csrc/pytorch/ms_deform_attn_cuda.cu +++ b/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu @@ -15,9 +15,10 @@ #include #include -#include #include +#include "ms_deform_attn_cuda_kernel.cuh" + template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value, const int64_t *data_spatial_shapes, diff --git a/mmcv/ops/csrc/parrots/nms_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/nms_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu diff --git a/mmcv/ops/csrc/parrots/nms_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/nms_rotated_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu diff --git a/mmcv/ops/csrc/parrots/psamask_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/psamask_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu diff --git a/mmcv/ops/csrc/parrots/roi_align_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/roi_align_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu diff --git a/mmcv/ops/csrc/pytorch/roi_align_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu similarity index 100% rename from mmcv/ops/csrc/pytorch/roi_align_rotated_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu diff --git a/mmcv/ops/csrc/parrots/roi_pool_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/roi_pool_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu diff --git a/mmcv/ops/csrc/parrots/sync_bn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/sync_bn_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu diff --git a/mmcv/ops/csrc/parrots/tin_shift_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu similarity index 100% rename from mmcv/ops/csrc/parrots/tin_shift_cuda.cu rename to mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu diff --git a/mmcv/ops/csrc/pytorch/upfirdn2d_kernel.cu b/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu similarity index 100% rename from mmcv/ops/csrc/pytorch/upfirdn2d_kernel.cu rename to mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu diff --git a/mmcv/ops/csrc/pytorch/nms_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/nms_rotated_cuda.cu deleted file mode 100644 index 60fe7d15e2..0000000000 --- a/mmcv/ops/csrc/pytorch/nms_rotated_cuda.cu +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -// modified from -// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu -#include "nms_rotated_cuda.cuh" -#include "pytorch_cuda_helper.hpp" - -Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, - const Tensor order_t, const Tensor dets_sorted, - float iou_threshold, const int multi_label) { - // using scalar_t = float; - AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor"); - AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor"); - at::cuda::CUDAGuard device_guard(dets.device()); - - int dets_num = dets.size(0); - - const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock); - - Tensor mask = - at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); - - dim3 blocks(col_blocks, col_blocks); - dim3 threads(threadsPerBlock); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - dets_sorted.type(), "nms_rotated_kernel_cuda", [&] { - nms_rotated_cuda_kernel<<>>( - dets_num, iou_threshold, dets_sorted.data(), - (unsigned long long*)mask.data(), multi_label); - }); - - Tensor mask_cpu = mask.to(at::kCPU); - unsigned long long* mask_host = (unsigned long long*)mask_cpu.data(); - - std::vector remv(col_blocks); - memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); - - Tensor keep = - at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); - int64_t* keep_out = keep.data(); - - int num_to_keep = 0; - for (int i = 0; i < dets_num; i++) { - int nblock = i / threadsPerBlock; - int inblock = i % threadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - keep_out[num_to_keep++] = i; - unsigned long long* p = mask_host + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - - AT_CUDA_CHECK(cudaGetLastError()); - return order_t.index( - {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) - .to(order_t.device(), keep.scalar_type())}); -} diff --git a/mmcv/ops/csrc/pytorch/roi_align_cpu.cpp b/mmcv/ops/csrc/pytorch/roi_align_cpu.cpp index 14d25b2ca0..3f797cb639 100644 --- a/mmcv/ops/csrc/pytorch/roi_align_cpu.cpp +++ b/mmcv/ops/csrc/pytorch/roi_align_cpu.cpp @@ -4,7 +4,7 @@ #include #include -#include "../pytorch_cpp_helper.hpp" +#include "pytorch_cpp_helper.hpp" // implementation taken from Caffe2 template diff --git a/mmcv/ops/csrc/pytorch/roi_align_rotated_cpu.cpp b/mmcv/ops/csrc/pytorch/roi_align_rotated_cpu.cpp index efc9ac22a7..8033de4f0f 100644 --- a/mmcv/ops/csrc/pytorch/roi_align_rotated_cpu.cpp +++ b/mmcv/ops/csrc/pytorch/roi_align_rotated_cpu.cpp @@ -4,7 +4,7 @@ #include #include -#include "../pytorch_cpp_helper.hpp" +#include "pytorch_cpp_helper.hpp" // implementation taken from Caffe2 template diff --git a/model.pth b/model.pth new file mode 100644 index 0000000000..9a60bd6576 Binary files /dev/null and b/model.pth differ diff --git a/modelA.pth b/modelA.pth new file mode 100644 index 0000000000..49cfa686fe Binary files /dev/null and b/modelA.pth differ diff --git a/setup.py b/setup.py index 4122c814d2..02476518e3 100644 --- a/setup.py +++ b/setup.py @@ -145,11 +145,10 @@ def get_extensions(): library_dirs += [tensorrt_lib_path] libraries += ['nvinfer', 'nvparsers', 'nvinfer_plugin'] libraries += ['cudart'] - kwargs = {} define_macros = [] extra_compile_args = {'cxx': []} - include_path = os.path.abspath('./mmcv/ops/csrc') + include_path = os.path.abspath('./mmcv/ops/csrc/common/cuda') include_trt_path = os.path.abspath('./mmcv/ops/csrc/tensorrt') include_dirs.append(include_path) include_dirs.append(include_trt_path) @@ -163,9 +162,6 @@ def get_extensions(): extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] library_dirs += library_paths(cuda=True) - kwargs['library_dirs'] = library_dirs - kwargs['libraries'] = libraries - from setuptools import Extension ext_ops = Extension( name=ext_name, @@ -187,9 +183,11 @@ def get_extensions(): # new parrots op impl do not use MMCV_USE_PARROTS # define_macros = [('MMCV_USE_PARROTS', None)] define_macros = [] - op_files = glob.glob('./mmcv/ops/csrc/parrots/*.cu') +\ + include_dirs = [] + op_files = glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') +\ glob.glob('./mmcv/ops/csrc/parrots/*.cpp') - include_dirs = [os.path.abspath('./mmcv/ops/csrc')] + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda')) cuda_args = os.getenv('MMCV_CUDA_ARGS') extra_compile_args = { 'nvcc': [cuda_args] if cuda_args else [], @@ -219,6 +217,7 @@ def get_extensions(): os.environ.setdefault('MAX_JOBS', '4') define_macros = [] extra_compile_args = {'cxx': []} + include_dirs = [] is_rocm_pytorch = False if parse_version(torch.__version__) >= parse_version('1.5'): @@ -226,13 +225,13 @@ def get_extensions(): is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False - this_dir = 'mmcv/ops/csrc/' + project_dir = 'mmcv/ops/csrc/' if is_rocm_pytorch: from torch.utils.hipify import hipify_python hipify_python.hipify( - project_directory=this_dir, - output_directory=this_dir, + project_directory=project_dir, + output_directory=project_dir, includes='mmcv/ops/csrc/*', show_detailed=True, is_pytorch_extension=True, @@ -243,25 +242,26 @@ def get_extensions(): extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] op_files = glob.glob('./mmcv/ops/csrc/pytorch/hip/*') extension = CUDAExtension - include_path = os.path.abspath('./mmcv/ops/csrc/hip') - + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/hip')) elif torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': define_macros += [('MMCV_WITH_CUDA', None)] cuda_args = os.getenv('MMCV_CUDA_ARGS') extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] - op_files = glob.glob('./mmcv/ops/csrc/pytorch/*') + op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ + glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') extension = CUDAExtension - include_path = os.path.abspath('./mmcv/ops/csrc') + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda')) else: print(f'Compiling {ext_name} without CUDA') op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') extension = CppExtension - include_path = os.path.abspath('./mmcv/ops/csrc') + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) ext_ops = extension( name=ext_name, sources=op_files, - include_dirs=[include_path], + include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args) extensions.append(ext_ops) @@ -276,7 +276,6 @@ def get_extensions(): ort_path = os.getenv('ONNXRUNTIME_DIR', '0') library_dirs += [os.path.join(ort_path, 'lib')] libraries.append('onnxruntime') - kwargs = {} define_macros = [] extra_compile_args = {'cxx': []} @@ -297,9 +296,6 @@ def get_extensions(): include_dirs += include_paths(cuda=False) library_dirs += library_paths(cuda=False) - kwargs['library_dirs'] = library_dirs - kwargs['libraries'] = libraries - from setuptools import Extension ext_ops = Extension( name=ext_name,