Avoid mixed usage of Array<T, N> and T[N] when passing to a template function #1475

Merged · 6 commits · Feb 22, 2024
38 changes: 32 additions & 6 deletions csrc/codegen.cpp
@@ -174,6 +174,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
initStringStreamFormat(code_);
}

// aligned array of registers used in the kernel
std::unordered_set<Val*> aligned_array_of_regs_;

using kir::ConstIrVisitor::handle;

void initStringStreamFormat(std::stringstream& ss) {
@@ -246,6 +249,24 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
return val_to_name_.at(v);
}

// If the variable is an aligned array, append ".array" to use the regular
// array. This avoids the type mismatch in template functions when one of the
// arguments is an aligned array (Array<T,N>) while the other is a regular
// array T[N].
std::string genVariableNameConvertAlignedArray(Val* v) {
TensorView* tv = nullptr;
if (v->isA<kir::TensorIndex>()) {
tv = v->as<kir::TensorIndex>()->view();
} else if (v->isA<TensorView>()) {
tv = v->as<TensorView>();
}
if (tv && aligned_array_of_regs_.count(tv)) {
return genVariableName(tv).append(".array");
} else {
return genVariableName(v);
}
}
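
As a minimal standalone sketch of the problem this helper works around (the struct and the `fun` template below are illustrative stand-ins, not nvFuser's actual runtime code; the real Array type also takes an alignment template parameter, as seen in the allocation code further down): when a template parameter must be deduced identically from two arguments, passing an aligned Array<T, N> next to a plain T[N] makes deduction fail, whereas passing the aligned array's raw .array member does not.

// Illustrative sketch only; names and signatures are assumptions.
template <typename scalar_t, int n>
struct Array {
  scalar_t array[n];  // ".array" exposes the underlying raw scalar_t[n]
};

template <typename T>
void fun(T& a, T& b) {}  // T must deduce to the same type from both arguments

int main() {
  Array<float, 4> T9;  // aligned array of registers
  float T10[4];        // regular register array
  // fun(T9, T10);     // error: T deduced as Array<float, 4> vs. float[4]
  fun(T9.array, T10);  // OK: both arguments deduce T = float[4]
}
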

// Generates the kernel function declaration
void genDeclaration(const std::string& kernel_name) {
code_ << "__global__ void " << kernel_name << "(";
@@ -2297,13 +2318,14 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
ArgumentBuilder func_args;

// outputs
- func_args.arg(genVariableName(output.get(0)));
- func_args.arg(genVariableName(output.get(1)));
- func_args.arg(genVariableName(output.get(2)));
+ func_args.arg(genVariableNameConvertAlignedArray(output.get(0)));
+ func_args.arg(genVariableNameConvertAlignedArray(output.get(1)));
+ func_args.arg(genVariableNameConvertAlignedArray(output.get(2)));
// inputs
- func_args.arg(genVariableName(input.get(0)));
- func_args.arg(genVariableName(input.get(1)));
- func_args.arg(genVariableName(input.get(2))).append("[0]");
+ func_args.arg(genVariableNameConvertAlignedArray(input.get(0)));
+ func_args.arg(genVariableNameConvertAlignedArray(input.get(1)));
+ func_args.arg(genVariableNameConvertAlignedArray(input.get(2)))
+     .append("[0]");

// global buf
for (const auto i : c10::irange(3)) {
@@ -2802,6 +2824,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
<< " = *reinterpret_cast<Array<" << buffer_dtype << ", "
<< genInline(size) << ">*>(&" << genVariableName(alias_tv)
<< ");\n";
if (alloc->memoryType() == MemoryType::Local) {
aligned_array_of_regs_.insert(tv);
}
}
} else {
// Standard Memory Allocation
@@ -2828,6 +2853,7 @@
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << va.at(tv) << "> " << genVariableName(tv)
<< ";\n";
aligned_array_of_regs_.insert(tv);
} else {
indent() << buffer_dtype << " " << genVariableName(tv) << "["
<< genInline(size) << "];\n";
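Putting the allocation and call-site changes together, the generated kernel code in the issue-#1470 scenario looks roughly like the sketch below. The variable names come from the test comment further down; the exact declarations are an assumption, not verbatim codegen output.

// Hypothetical reconstruction of the relevant generated lines.
nvfuser_index_t T5[4];                                           // regular register array
Array<float, 4>& T9 = *reinterpret_cast<Array<float, 4>*>(&T5);  // alias -> aligned array
float T4[4];                                                     // regular register array
auto& T10 = T4;                                                  // alias -> still a regular array
// Before this change: welfordGroupOuter<...>(..., T9, ..., T10, ...)       -> type mismatch
// After this change:  welfordGroupOuter<...>(..., T9.array, ..., T10, ...) -> compiles
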
48 changes: 48 additions & 0 deletions test/test_gpu3.cpp
@@ -8783,6 +8783,54 @@ TEST_F(NVFuserTest, UnsupportedBFloat) {
testing::ThrowsMessage<nvfuser::nvfError>(
testing::HasSubstr("Reason: Fusion contains BFloat16")));
}

// Issue #1470 reproduction:
// `nvfuser_index_t T5[4]` is aliased as `Array<float, 4> T9`.
// `float T4[4]` is aliased as `auto& T10 = T4`.
// Using `T9` and `T10` in the `welfordGroupOuter` function causes a compilation
// error due to a type mismatch: `T9` is an aligned array, while `T10` is a
// regular array. The codegen should emit fun<>(T9.array, T10) instead of
// fun<>(T9, T10).
TEST_F(NVFuserTest, TemplateFunctionTypeMismatch) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

const int batch_size = 8192;
const int hidden_size = 1024;
DataType input_dtype = DataType::Float;
auto tv0 = makeContigTensor(2, input_dtype);
fusion->addInput(tv0);
auto tv1 = set(tv0);
auto tv2 = add(tv1, tv1);
auto tv3 = Welford(tv2, {0});
auto tv4 = broadcast(tv3.avg, {true, false});
auto tv5 = div(tv2, tv4);

auto tv6 = exp(tv5);
auto tv7 = Welford(tv6, {0});
auto tv8 = broadcast(tv7.avg, {true, false});
auto tv9 = div(tv6, tv8);

fusion->addOutput(tv5);
fusion->addOutput(tv9);

auto options = at::TensorOptions()
.dtype(data_type_to_aten(input_dtype))
.device(at::kCUDA, 0);
auto t0 = at::randn({batch_size, hidden_size}, options);
std::vector<c10::IValue> inputs{t0};

auto persistent_params = getOuterPersistentHeuristics(fusion, inputs);
NVF_CHECK(persistent_params, "Reduction schedule was not generated!");
scheduleOuterPersistentKernel(fusion, *persistent_params);
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
FusionExecutor fe;
fe.compileFusion(
fusion, args, persistent_params->lparams, persistent_params->cparams);
auto cg_outputs = fe.runFusion(args, persistent_params->lparams);
}
// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser