diff --git a/example/test_linear.py b/example/test_linear.py index c2ad4db..8ccbbbc 100644 --- a/example/test_linear.py +++ b/example/test_linear.py @@ -118,7 +118,11 @@ def load_specified_linear_weights(): model.post_init() -res = model(x) +stream = torch.cuda.Stream() +with torch.cuda.stream(stream): + res = model(x) +stream.synchronize() + print(f'tm.linear.res: {res}') abs_diff = torch.abs(res - ref_res).float() rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) diff --git a/src/turbomind/api/python/bind.cpp b/src/turbomind/api/python/bind.cpp index 8140b46..5eadc1e 100644 --- a/src/turbomind/api/python/bind.cpp +++ b/src/turbomind/api/python/bind.cpp @@ -329,16 +329,17 @@ PYBIND11_MODULE(_turbomind_ext, m) { .def(py::init([](size_t in_features, size_t out_features, int w_bit, int group_size) { return new turbomind::Linear(in_features, out_features, w_bit, group_size); })) - .def("post_init", [](turbomind::Linear* linear, py::object qweight, py::object scales, py::object qzeros, + .def("post_init", [](turbomind::Linear* self, py::object qweight, py::object scales, py::object qzeros, bool simt){ auto _qweight = TorchTensorToTurbomindTensor(qweight); auto _scales = TorchTensorToTurbomindTensor(scales); auto _qzeros = TorchTensorToTurbomindTensor(qzeros); - linear->post_init(_qweight, *_scales, *_qzeros, simt); + self->post_init(_qweight, *_scales, *_qzeros, simt); }) - .def("forward", [](turbomind::Linear* linear, py::object in, py::object out) { + .def("forward", [](turbomind::Linear* self, py::object in, py::object out, int64_t stream_id = 0) { auto _in = TorchTensorToTurbomindTensor(in); auto _out = TorchTensorToTurbomindTensor(out); - return linear->forward(*_in, *_out); + auto stream = reinterpret_cast<cudaStream_t>(stream_id); + return self->forward(*_in, *_out, stream); }); } diff --git a/src/turbomind/api/python/linear.cc b/src/turbomind/api/python/linear.cc index 6641110..d05c7d4 100644 --- a/src/turbomind/api/python/linear.cc +++ 
b/src/turbomind/api/python/linear.cc @@ -36,15 +36,15 @@ struct Linear::Impl { workspace_.barriers_size = gemm::Gemm::kBarriersSize; workspace_.partials_size = gemm::Gemm::kPartialsSize; - cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); - cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); - cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, 0); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, 0); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, 0); } ~Impl() { - cudaFreeAsync(workspace_.barriers, stream_); - cudaFreeAsync(workspace_.partials, stream_); + cudaFreeAsync(workspace_.barriers, 0); + cudaFreeAsync(workspace_.partials, 0); workspace_ = {}; check_cuda_error(cudaFree(scales_zeros_)); } @@ -60,7 +60,7 @@ struct Linear::Impl { check_cuda_error(cudaFree(workspace)); } - void forward(const Tensor& in, Tensor& out) { + void forward(const Tensor& in, Tensor& out, cudaStream_t stream) { TM_CHECK(in.type == TYPE_FP16 && out.type == TYPE_FP16); TM_CHECK(in.shape.size() == 2 && in.shape[1] == input_dims_); TM_CHECK(out.shape.size() == 2 && out.shape[0] == in.shape[0] && out.shape[1] == output_dims_); @@ -106,7 +106,7 @@ struct Linear::Impl { const_cast<void*>(out.data), c_desc, workspace_, - stream_); + stream); if (ec) { printf("%s: %d", __PRETTY_FUNCTION__, ec); @@ -266,7 +266,6 @@ struct Linear::Impl { gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; gemm::Workspace workspace_; - cudaStream_t stream_{}; size_t input_dims_; size_t output_dims_; @@ -288,8 +287,8 @@ void Linear::post_init(std::shared_ptr<Tensor> qweight, const Tensor& scales, co impl_->post_init(qweight, scales, qzeros, simt); } -void Linear::forward(const Tensor& in, Tensor& out) +void Linear::forward(const Tensor& in, Tensor& out, cudaStream_t stream) { - impl_->forward(in, out); + impl_->forward(in, 
out, stream); } } // namespace turbomind diff --git a/src/turbomind/api/python/linear.h b/src/turbomind/api/python/linear.h index 11e81eb..8b611f2 100644 --- a/src/turbomind/api/python/linear.h +++ b/src/turbomind/api/python/linear.h @@ -28,7 +28,7 @@ class Linear { Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); void post_init(std::shared_ptr<Tensor> qweight, const Tensor& scales, const Tensor& qzeros, bool simt); - void forward(const Tensor& in, Tensor& out); + void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr); ~Linear() {} private: diff --git a/turbomind/linear.py b/turbomind/linear.py index 1e9d221..a7fea9b 100644 --- a/turbomind/linear.py +++ b/turbomind/linear.py @@ -163,7 +163,8 @@ def forward(self, x): dtype=torch.float16, device=x.device, ) - self.linear.forward(x, out) + stream = torch.cuda.current_stream() + self.linear.forward(x, out, stream.cuda_stream) out = torch.from_dlpack(out) if self.bias is not None: out.add_(self.bias)