Commit

add genVarForTemplateFunction

liqiangxl committed Feb 17, 2024
1 parent bfb086e commit 7905dde

Showing 2 changed files with 76 additions and 6 deletions.
34 changes: 28 additions & 6 deletions csrc/codegen.cpp
@@ -175,6 +175,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
initStringStreamFormat(code_);
}

std::unordered_set<Val*> aligned_array_of_regs_;

using kir::ConstIrVisitor::handle;

void initStringStreamFormat(std::stringstream& ss) {
@@ -247,6 +249,24 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
return val_to_name_.at(v);
}

// A wrapper around genVariableName that also appends the array suffix if the
// variable is an aligned array of registers. This avoids the type mismatch in
// template functions when one of the arguments is an aligned array
// (Array<T,N>) while another is a regular array T[N].
std::string genVarForTemplateFunction(Val* v) {
TensorView* tv = nullptr;
if (v->isA<kir::TensorIndex>()) {
tv = v->as<kir::TensorIndex>()->view();
} else if (v->isA<TensorView>()) {
tv = v->as<TensorView>();
}
if (tv && aligned_array_of_regs_.count(tv)) {
return genVariableName(tv).append(".array");
} else {
return genVariableName(v);
}
}

// Generates the kernel function declaration
void genDeclaration(const std::string& kernel_name) {
code_ << "__global__ void " << kernel_name << "(";
@@ -2298,13 +2318,13 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
ArgumentBuilder func_args;

// outputs
func_args.arg(genVariableName(output.get(0)));
func_args.arg(genVariableName(output.get(1)));
func_args.arg(genVariableName(output.get(2)));
func_args.arg(genVarForTemplateFunction(output.get(0)));
func_args.arg(genVarForTemplateFunction(output.get(1)));
func_args.arg(genVarForTemplateFunction(output.get(2)));
// inputs
func_args.arg(genVariableName(input.get(0)));
func_args.arg(genVariableName(input.get(1)));
func_args.arg(genVariableName(input.get(2))).append("[0]");
func_args.arg(genVarForTemplateFunction(input.get(0)));
func_args.arg(genVarForTemplateFunction(input.get(1)));
func_args.arg(genVarForTemplateFunction(input.get(2))).append("[0]");

// global buf
for (const auto i : c10::irange(3)) {
@@ -2803,6 +2823,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
<< " = *reinterpret_cast<Array<" << buffer_dtype << ", "
<< genInline(size) << ">*>(&" << genVariableName(alias_tv)
<< ");\n";
aligned_array_of_regs_.insert(tv);
}
} else {
// Standard Memory Allocation
@@ -2829,6 +2850,7 @@
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << va.at(tv) << "> " << genVariableName(tv)
<< ";\n";
aligned_array_of_regs_.insert(tv);
} else {
indent() << buffer_dtype << " " << genVariableName(tv) << "["
<< genInline(size) << "];\n";
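For context, the mismatch that genVarForTemplateFunction works around can be reproduced in isolation. The following is a minimal host-side sketch, not nvFuser's actual runtime code: Array stands in for nvFuser's aligned register array (assumed, from the `.append(".array")` call above, to expose its storage through an `array` member), and groupedCall is a hypothetical template that, like the runtime's grouped functions, must deduce a single element type from all of its array arguments.

// Minimal stand-in for nvFuser's aligned register array; the real type
// also carries an alignment parameter and helper methods.
template <typename T, int N>
struct Array {
  T array[N];
};

// Hypothetical template in the spirit of the grouped runtime functions:
// every array argument must deduce to the same T (&)[N].
template <typename T, int N>
void groupedCall(T (&out)[N], const T (&in)[N]) {
  for (int i = 0; i < N; ++i) {
    out[i] = in[i];
  }
}

int main() {
  Array<float, 4> T9{}; // register buffer declared as an aligned array
  float T10[4] = {};    // register buffer declared as a plain array

  // groupedCall(T9, T10);     // would not compile: Array<float, 4> vs float[4]
  groupedCall(T9.array, T10);  // appending ".array" restores the float[4] view
  return 0;
}

Because the ".array" suffix is appended only for variables registered in aligned_array_of_regs_, plain register arrays keep their original spelling.
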
48 changes: 48 additions & 0 deletions test/test_gpu3.cpp
@@ -8738,6 +8738,54 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
}
}
}

// Issue #1470 reproduction:
// `nvfuser_index_t T5[4]` is aliased as `Array<float, 4> T9`.
// `float T4[4]` is aliased as `auto& T10 = T4`.
// Using `T9` and `T10` in the `welfordGroupOuter` function causes a compilation
// error due to a type mismatch: `T9` is a custom array type, while `T10` is a
// native float array. When aliasing different types, whether to use
// Array<T, N> or T[N] should depend on how the original tv was allocated.
TEST_F(NVFuserTest, TemplateFunctionTypeMismatch) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

const int batch_size = 8192;
const int hidden_size = 1024;
DataType input_dtype = DataType::Float;
auto tv0 = makeContigTensor(2, input_dtype);
fusion->addInput(tv0);
auto tv1 = set(tv0);
auto tv2 = add(tv1, tv1);
auto tv3 = Welford(tv2, {0});
auto tv4 = broadcast(tv3.avg, {true, false});
auto tv5 = div(tv2, tv4);

auto tv6 = exp(tv5);
auto tv7 = Welford(tv6, {0});
auto tv8 = broadcast(tv7.avg, {true, false});
auto tv9 = div(tv6, tv8);

fusion->addOutput(tv5);
fusion->addOutput(tv9);

auto options = at::TensorOptions()
.dtype(data_type_to_aten(input_dtype))
.device(at::kCUDA, 0);
auto t0 = at::randn({batch_size, hidden_size}, options);
std::vector<c10::IValue> inputs{t0};

auto persistent_params = getOuterPersistentHeuristics(fusion, inputs);
NVF_CHECK(persistent_params, "Reduction schedule was not generated!");
scheduleOuterPersistentKernel(fusion, *persistent_params);
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
FusionExecutor fe;
fe.compileFusion(
fusion, args, persistent_params->lparams, persistent_params->cparams);
auto cg_outputs = fe.runFusion(args, persistent_params->lparams);
}
// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser
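
For reference, here is a self-contained schematic of the generated fragments described in the test's comment. The variable names follow issue #1470; the nvfuser_index_t and Array definitions below exist only so the sketch compiles on its own, and welfordGroupOuter's real argument list is elided in the comments.

#include <cstdint>

using nvfuser_index_t = int64_t; // assumption: the usual 64-bit index type

template <typename T, int N>
struct Array { // stand-in for nvFuser's aligned register array
  T array[N];
};

void sketch() {
  nvfuser_index_t T5[4];                               // original register buffer
  auto& T9 = *reinterpret_cast<Array<float, 4>*>(&T5); // aliased as Array<float, 4>
  float T4[4];
  auto& T10 = T4;                                      // aliased as a plain float[4]

  // Before this commit the template call mixed the two array kinds:
  //   welfordGroupOuter(..., T9, ..., T10, ...);      // Array<float, 4> vs float[4]
  // With genVarForTemplateFunction the aligned alias is passed as T9.array:
  //   welfordGroupOuter(..., T9.array, ..., T10, ...);
  (void)T9;
  (void)T10;
}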
