diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index ef92f9ae3175..22dde1c46389 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -57,6 +57,10 @@ std::string CodeGenCUDA::Finish() {
     decl_stream << "#include <sm_61_intrinsics.h>\n";
   }
 
+  if (need_math_constants_h_) {
+    decl_stream << "#include <math_constants.h>\n";
+  }
+
   return CodeGenC::Finish();
 }
 
@@ -318,8 +322,19 @@ inline void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p) { // NOLINT(*)
   switch (op->type.bits()) {
     case 64: case 32: {
      std::ostringstream temp;
-      temp << std::scientific << op->value;
-      if (op->type.bits() == 32) temp << 'f';
+      if (std::isinf(op->value)) {
+        if (op->value < 0) {
+          temp << "-";
+        }
+        temp << ((op->type.bits() == 32) ? "CUDART_INF_F" : "CUDART_INF");
+        p->need_math_constants_h_ = true;
+      } else if (std::isnan(op->value)) {
+        temp << ((op->type.bits() == 32) ? "CUDART_NAN_F" : "CUDART_NAN");
+        p->need_math_constants_h_ = true;
+      } else {
+        temp << std::scientific << op->value;
+        if (op->type.bits() == 32) temp << 'f';
+      }
       p->MarkConst(temp.str());
       os << temp.str();
       break;
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 381784a13a57..acd759f33889 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -39,7 +39,9 @@ class CodeGenCUDA final : public CodeGenC {
   void Init(bool output_ssa);
   void AddFunction(LoweredFunc f);
   std::string Finish();
-  bool need_include_path() { return (enable_fp16_ || enable_int8_); }
+  bool need_include_path() {
+    return (enable_fp16_ || enable_int8_ || need_math_constants_h_);
+  }
   // override behavior
   void VisitStmt_(const ir::For* op) final;
   void PrintStorageSync(const Call* op) final;
@@ -70,6 +72,9 @@ class CodeGenCUDA final : public CodeGenC {
   bool enable_fp16_{false};
   // whether enable int8
   bool enable_int8_{false};
+  // whether need math_constants.h
+  bool need_math_constants_h_{false};
+
+  friend void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p);
 };
 
 }  // namespace codegen
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 382124a7ed2d..0b33bf43c151 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -247,6 +247,19 @@ void CodeGenOpenCL::VisitExpr_(const Select* op, std::ostream& os) {  // NOLINT(*)
   CodeGenC::VisitExpr_(op, os);
 }
 
+void CodeGenOpenCL::VisitExpr_(const FloatImm *op, std::ostream& os) {  // NOLINT(*)
+  if (std::isinf(op->value)) {
+    if (op->value < 0) {
+      os << "-";
+    }
+    os << "INFINITY";
+  } else if (std::isnan(op->value)) {
+    os << "NAN";
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
 runtime::Module BuildOpenCL(Array<LoweredFunc> funcs) {
   using tvm::runtime::Registry;
   bool output_ssa = false;
diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h
index 0eff3a633ba3..36a55a545cbd 100644
--- a/src/codegen/codegen_opencl.h
+++ b/src/codegen/codegen_opencl.h
@@ -59,6 +59,7 @@ class CodeGenOpenCL final : public CodeGenC {
   void VisitExpr_(const Broadcast* op, std::ostream& os) final;  // NOLINT(*)
   void VisitExpr_(const Call* op, std::ostream& os) final;  // NOLINT(*)
   void VisitExpr_(const Select* op, std::ostream& os) final;  // NOLINT(*)
+  void VisitExpr_(const FloatImm *op, std::ostream& os) final;  // NOLINT(*)
 
  private:
   // whether enable fp16 and fp64 extension
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index f28b4ccfd1da..8fe6720830a5 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -125,8 +125,38 @@ def check_cuda(n, value):
     check_cuda(64, 0)
     check_cuda(64, -3)
+
+def test_cuda_inf_nan():
+    target = 'cuda'
+    def check_inf_nan(ctx, n, value, dtype):
+        A = tvm.placeholder((n,), name='A', dtype=dtype)
+        inf_value = tvm.const(value, dtype=dtype)
+        C = tvm.compute((n,), lambda i: inf_value, name='C')
+        s = tvm.create_schedule(C.op)
+        s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, C], target)
+        a = tvm.nd.empty((n,), A.dtype, ctx)
+        c = tvm.nd.empty((n,), A.dtype, ctx)
+        # Only need to test compiling here
+        fun(a, c)
+
+    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+        print("skip because cuda is not enabled..")
+        return
+
+    ctx = tvm.context(target, 0)
+
+    check_inf_nan(ctx, 1, -float('inf'), 'float32')
+    check_inf_nan(ctx, 1, -float('inf'), 'float64')
+    check_inf_nan(ctx, 1, float('inf'), 'float32')
+    check_inf_nan(ctx, 1, float('inf'), 'float64')
+    check_inf_nan(ctx, 1, float('nan'), 'float32')
+    check_inf_nan(ctx, 1, float('nan'), 'float64')
+
+
 
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
     test_cuda_vectorize_load()
     test_cuda_make_int8x4()
+    test_cuda_inf_nan()
diff --git a/tests/python/unittest/test_codegen_opencl.py b/tests/python/unittest/test_codegen_opencl.py
index c484664bdfd8..71fc4f9a7f35 100644
--- a/tests/python/unittest/test_codegen_opencl.py
+++ b/tests/python/unittest/test_codegen_opencl.py
@@ -66,6 +66,33 @@ def check_select(ctx, n, dtype):
     check_select(ctx, 1, 'int16')
     check_select(ctx, 1, 'uint16')
 
+def test_opencl_inf_nan():
+    def check_inf_nan(ctx, n, value, dtype):
+        A = tvm.placeholder((n,), name='A', dtype=dtype)
+        inf_value = tvm.const(value, dtype=dtype)
+        C = tvm.compute((n,), lambda i: inf_value, name='C')
+        s = tvm.create_schedule(C.op)
+        s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
+        fun = tvm.build(s, [A, C], target)
+        a = tvm.nd.empty((n,), A.dtype, ctx)
+        c = tvm.nd.empty((n,), A.dtype, ctx)
+        # Only need to test compiling here
+        fun(a, c)
+
+    if not tvm.module.enabled(target):
+        print("skip because opencl is not enabled..")
+        return
+
+    ctx = tvm.context(target, 0)
+
+    check_inf_nan(ctx, 1, -float('inf'), 'float32')
+    check_inf_nan(ctx, 1, -float('inf'), 'float64')
+    check_inf_nan(ctx, 1, float('inf'), 'float32')
+    check_inf_nan(ctx, 1, float('inf'), 'float64')
+    check_inf_nan(ctx, 1, float('nan'), 'float32')
+    check_inf_nan(ctx, 1, float('nan'), 'float64')
+
 
 if __name__ == "__main__":
     test_opencl_ternary_expression()
+    test_opencl_inf_nan()
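
For reference, here is a rough sketch (not part of the patch) of the kind of CUDA source the backend emits for the float32 inf case exercised in check_inf_nan above. The kernel name and signature are hypothetical; only the math_constants.h include and the CUDART_* spelling of the constant come from this change.

    // Hypothetical sketch: generated source for tvm.const(float('inf'), 'float32')
    // under the one-thread-per-element schedule used in check_inf_nan.
    #include <math_constants.h>

    extern "C" __global__ void default_function_kernel0(float* __restrict__ C) {
      // inf/nan are printed as CUDART_INF_F / CUDART_NAN_F rather than an
      // unparsable scientific-notation literal.
      C[((int)threadIdx.x)] = CUDART_INF_F;
    }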