IncrementalCUDADeviceCompiler.cpp
//--------------------------------------------------------------------*- C++ -*-
// CLING - the C++ LLVM-based InterpreterG :)
// author: Simeon Ehrig <[email protected]>
//
// This file is dual-licensed: you can choose to license it under the University
// of Illinois Open Source License or the GNU Lesser General Public License. See
// LICENSE.TXT for details.
//------------------------------------------------------------------------------
#include "cling/Interpreter/IncrementalCUDADeviceCompiler.h"
#include "cling/Interpreter/Interpreter.h"
#include "cling/Interpreter/InvocationOptions.h"
#include "cling/Interpreter/Transaction.h"
#include "clang/Basic/TargetOptions.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/HeaderSearchOptions.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <algorithm>
#include <bitset>
#include <string>
#include <system_error>
namespace cling {
IncrementalCUDADeviceCompiler::IncrementalCUDADeviceCompiler(
const std::string& filePath, const int optLevel,
const cling::InvocationOptions& invocationOptions,
const clang::CompilerInstance& CI)
: m_FilePath(filePath),
m_FatbinFilePath(CI.getCodeGenOpts().CudaGpuBinaryFileName) {
if (m_FatbinFilePath.empty()) {
llvm::errs() << "Error: CudaGpuBinaryFileNames can't be empty\n";
return;
}
setCuArgs(CI.getLangOpts(), invocationOptions,
CI.getCodeGenOpts().getDebugInfo(),
llvm::Triple(CI.getTargetOpts().Triple));
// cling -std=c++xx -Ox -x cuda -S --cuda-gpu-arch=sm_xx --cuda-device-only
// ${include headers} ${-I/paths} [-v] [-g] ${m_CuArgs->additionalPtxOpt}
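// e.g. (hypothetical values): cling -std=c++14 -O2 -x cuda -S
//      --cuda-gpu-arch=sm_35 --cuda-device-only -I/usr/local/cuda/include -v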
argv = {"cling",
m_CuArgs->cppStdVersion.c_str(),
"-O" + std::to_string(optLevel),
"-x",
"cuda",
"-S",
std::string("--cuda-gpu-arch=sm_")
.append(std::to_string(m_CuArgs->smVersion)),
"--cuda-device-only"};
addHeaderSearchPathFlags(argv, CI.getHeaderSearchOptsPtr());
if (m_CuArgs->verbose)
argv.push_back("-v");
if (m_CuArgs->debug)
argv.push_back("-g");
argv.insert(argv.end(), m_CuArgs->additionalPtxOpt.begin(),
m_CuArgs->additionalPtxOpt.end());
// forward the -include options to the PTX cling instance
for (const char* c : invocationOptions.CompilerOpts.Remaining) {
std::string s(c);
if (s.find("-include") == 0)
argv.push_back(s);
}
std::vector<const char*> argvChar;
argvChar.resize(argv.size());
std::transform(argv.begin(), argv.end(), argvChar.begin(),
[&](const std::string& s) { return s.c_str(); });
// the argv list has to be terminated by a nullptr
argvChar.push_back(nullptr);
// create incremental compiler instance
m_PTX_interp.reset(new Interpreter(argvChar.size(), argvChar.data()));
if (!m_PTX_interp) {
llvm::errs() << "Could not create PTX interpreter instance\n";
return;
}
// initialize NVPTX backend
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
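// These registrations are required so that generatePTX() can look up the
// NVPTX target and create a TargetMachine for it.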
m_Init = true;
}
void IncrementalCUDADeviceCompiler::setCuArgs(
const clang::LangOptions& langOpts,
const cling::InvocationOptions& invocationOptions,
const clang::codegenoptions::DebugInfoKind debugInfo,
const llvm::Triple hostTriple) {
std::string cppStdVersion;
// Set the C++ standard. The later checks overwrite the earlier ones, so the
// newest enabled standard wins.
if (langOpts.CPlusPlus11)
cppStdVersion = "-std=c++11";
if (langOpts.CPlusPlus14)
cppStdVersion = "-std=c++14";
if (langOpts.CPlusPlus17)
cppStdVersion = "-std=c++1z";
if (langOpts.CPlusPlus2a)
cppStdVersion = "-std=c++2a";
if (cppStdVersion.empty())
llvm::errs()
<< "IncrementalCUDADeviceCompiler: No valid c++ standard is set.\n";
uint32_t smVersion = 20;
if (!invocationOptions.CompilerOpts.CUDAGpuArch.empty()) {
llvm::StringRef(invocationOptions.CompilerOpts.CUDAGpuArch)
.drop_front(3 /* sm_ */)
.getAsInteger(10, smVersion);
}
// FIXME: The fine-grained debug options should not be reduced to a simple -g.
bool debug = false;
if (debugInfo == clang::codegenoptions::DebugLineTablesOnly ||
debugInfo == clang::codegenoptions::LimitedDebugInfo ||
debugInfo == clang::codegenoptions::FullDebugInfo)
debug = true;
// FIXME: Cling has problems detecting these arguments.
/*
if(langOpts.CUDADeviceFlushDenormalsToZero)
m_CuArgs.additionalPtxOpt.push_back("-fcuda-flush-denormals-to-zero");
if(langOpts.CUDADeviceApproxTranscendentals)
m_CuArgs.additionalPtxOpt.push_back("-fcuda-approx-transcendentals");
if(langOpts.CUDAAllowVariadicFunctions)
m_CuArgs.additionalPtxOpt.push_back("-fcuda-allow-variadic-functions");
*/
std::vector<std::string> additionalPtxOpt;
// search for macro definitions (-Dmacro=value) in the args and forward them
// to the PTX compiler args
for (const char* arg : invocationOptions.CompilerOpts.Remaining) {
std::string s = arg;
if (s.compare(0, 2, "-D") == 0)
additionalPtxOpt.push_back(s);
}
// use custom CUDA SDK path
if(!invocationOptions.CompilerOpts.CUDAPath.empty()){
additionalPtxOpt.push_back("--cuda-path=" + invocationOptions.CompilerOpts.CUDAPath);
}
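// These flag bits are written into the Flags field of the fatbin file header
// in generateFatbinary(); they encode the host pointer size, the host OS and
// whether debug info is present.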
enum FatBinFlags {
AddressSize64 = 0x01,
HasDebugInfo = 0x02,
ProducerCuda = 0x04,
HostLinux = 0x10,
HostMac = 0x20,
HostWindows = 0x40
};
uint32_t fatbinFlags = FatBinFlags::ProducerCuda;
if (debug)
fatbinFlags |= FatBinFlags::HasDebugInfo;
if (hostTriple.isArch64Bit())
fatbinFlags |= FatBinFlags::AddressSize64;
if (hostTriple.isOSWindows())
fatbinFlags |= FatBinFlags::HostWindows;
else if (hostTriple.isOSDarwin())
fatbinFlags |= FatBinFlags::HostMac;
else
fatbinFlags |= FatBinFlags::HostLinux;
m_CuArgs.reset(new IncrementalCUDADeviceCompiler::CUDACompilerArgs(
cppStdVersion, hostTriple, smVersion, fatbinFlags,
invocationOptions.Verbose(), debug, additionalPtxOpt));
}
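// Forward the host interpreter's user include paths (-iquote and -I entries)
// to the device compiler's argument list.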
void IncrementalCUDADeviceCompiler::addHeaderSearchPathFlags(
std::vector<std::string>& argv,
const std::shared_ptr<clang::HeaderSearchOptions> &headerSearchOptions) {
for (clang::HeaderSearchOptions::Entry e :
headerSearchOptions->UserEntries) {
if (e.Group == clang::frontend::IncludeDirGroup::Quoted) {
argv.push_back("-iquote");
argv.push_back(e.Path);
}
if (e.Group == clang::frontend::IncludeDirGroup::Angled)
argv.push_back("-I" + e.Path);
}
}
// FIXME: add the same arguments as the cling::Interpreter class -> need some
// modifications in the cling::Transaction class to store information from the
// device compiler
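// Compile the device-side part of the input with the PTX interpreter and, if
// the input is complete, regenerate the PTX and the fatbin file that the host
// CodeGen picks up via m_FatbinFilePath.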
bool IncrementalCUDADeviceCompiler::process(const std::string& input) {
if (!m_Init) {
llvm::errs()
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
return false;
}
Interpreter::CompilationResult CR = m_PTX_interp->process(input);
if (CR == Interpreter::CompilationResult::kFailure) {
llvm::errs() << "IncrementalCUDADeviceCompiler::process()\n"
<< "failed at compile ptx code\n";
return false;
}
// e.g. an unclosed block; wait for more input before generating PTX
if (CR == Interpreter::CompilationResult::kMoreInputExpected)
return true;
if (!generatePTX() || !generateFatbinary())
return false;
return true;
}
// FIXME: see process()
bool IncrementalCUDADeviceCompiler::declare(const std::string& input) {
if (!m_Init) {
llvm::errs()
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
return false;
}
Interpreter::CompilationResult CR = m_PTX_interp->declare(input);
if (CR == Interpreter::CompilationResult::kFailure) {
llvm::errs() << "IncrementalCUDADeviceCompiler::declare()\n"
<< "failed at compile ptx code\n";
return false;
}
// e.g. an unclosed block; wait for more input before generating PTX
if (CR == Interpreter::CompilationResult::kMoreInputExpected)
return true;
if (!generatePTX() || !generateFatbinary())
return false;
return true;
}
// FIXME: see process()
bool IncrementalCUDADeviceCompiler::parse(const std::string& input) const {
if (!m_Init) {
llvm::errs()
<< "Error: Initializiation of CUDA Device Code Compiler failed\n";
return false;
}
Interpreter::CompilationResult CR = m_PTX_interp->parse(input);
if (CR == Interpreter::CompilationResult::kFailure) {
llvm::errs() << "IncrementalCUDADeviceCompiler::parse()"
<< "failed at compile ptx code\n";
return false;
}
return true;
}
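// Lower the module of the last transaction with the NVPTX backend and store
// the resulting PTX assembly in m_PTX_code.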
bool cling::IncrementalCUDADeviceCompiler::generatePTX() {
// discard the PTX code compiled from the previous input
m_PTX_code = "";
llvm::Module* module = m_PTX_interp->getLastTransaction()->getModule();
std::string error;
auto Target =
llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
if (!Target) {
llvm::errs() << error;
return false;
}
// the relocation model is not important, because PTX does not use any object
// format
llvm::Optional<llvm::Reloc::Model> RM =
llvm::Optional<llvm::Reloc::Model>(llvm::Reloc::Model::PIC_);
llvm::TargetOptions TO = llvm::TargetOptions();
llvm::TargetMachine* targetMachine = Target->createTargetMachine(
module->getTargetTriple(),
std::string("sm_").append(std::to_string(m_CuArgs->smVersion)), "", TO,
RM);
module->setDataLayout(targetMachine->createDataLayout());
llvm::raw_svector_ostream dest(m_PTX_code);
llvm::legacy::PassManager pass;
// the file type has to be assembly; emitting an object file is not supported
// and would not make sense for PTX
auto FileType = llvm::TargetMachine::CGFT_AssemblyFile;
if (targetMachine->addPassesToEmitFile(pass, dest, /*DwoOut*/ nullptr,
FileType)) {
llvm::errs() << "TargetMachine can't emit assembler code";
return 1;
}
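// running the passes writes the PTX through the raw_svector_ostream into
// m_PTX_code; legacy PassManager::run() returns whether the module was
// modified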
return pass.run(*module);
}
bool IncrementalCUDADeviceCompiler::generateFatbinary() {
// FIXME: At the moment the fatbin code must be written to a file so that
// CodeGen can use it. This should be replaced by an in-memory solution
// (e.g. a virtual file).
std::error_code EC;
llvm::raw_fd_ostream os(m_FatbinFilePath, EC, llvm::sys::fs::F_None);
if (EC) {
llvm::errs() << "ERROR: cannot generate file " << m_FatbinFilePath
<< "\n";
return false;
}
// The implementation is adapted from resolveFunction() in clangJIT
// (https://github.com/hfinkel/llvm-project-cxxjit/blob/cxxjit/clang/lib/CodeGen/JIT.cpp):
// void *resolveFunction(const void *NTTPValues, const char **TypeStrings,
//                       unsigned Idx)
// The outer header of the fat binary is documented in the CUDA
// fatbinary.h header. As mentioned there, the overall size must be a
// multiple of eight, and so we must make sure that the PTX is.
// We also need to make sure that the buffer is explicitly null
// terminated (cuobjdump, at least, seems to assume that it is).
m_PTX_code += '\0';
while (m_PTX_code.size() % 8)
m_PTX_code += '\0';
// NVIDIA, unfortunately, does not provide full documentation on their
// fatbin format. There is some information on the outer header block in
// the CUDA fatbinary.h header. Also, it is possible to figure out more
// about the format by creating fatbins using the provided utilities
// and then observing what cuobjdump reports about the resulting files.
// There are some other online references which shed light on the format,
// including https://reviews.llvm.org/D8397 and FatBinaryContext.{cpp,h}
// from the GPU Ocelot project (https://github.com/gtcasl/gpuocelot).
struct FatBinHeader {
uint32_t Magic; // 0x00
uint16_t Version; // 0x04
uint16_t HeaderSize; // 0x06
uint32_t DataSize; // 0x08
uint32_t unknown0c; // 0x0c
public:
FatBinHeader(uint32_t DataSize)
: Magic(0xba55ed50), Version(1), HeaderSize(sizeof(*this)),
DataSize(DataSize), unknown0c(0) {}
};
struct FatBinFileHeader {
uint16_t Kind; // 0x00
uint16_t unknown02; // 0x02
uint32_t HeaderSize; // 0x04
uint32_t DataSize; // 0x08
uint32_t unknown0c; // 0x0c
uint32_t CompressedSize; // 0x10
uint32_t SubHeaderSize; // 0x14
uint16_t VersionMinor; // 0x18
uint16_t VersionMajor; // 0x1a
uint32_t CudaArch; // 0x1c
uint32_t unknown20; // 0x20
uint32_t unknown24; // 0x24
uint32_t Flags; // 0x28
uint32_t unknown2c; // 0x2c
uint32_t unknown30; // 0x30
uint32_t unknown34; // 0x34
uint32_t UncompressedSize; // 0x38
uint32_t unknown3c; // 0x3c
uint32_t unknown40; // 0x40
uint32_t unknown44; // 0x44
FatBinFileHeader(uint32_t DataSize, uint32_t CudaArch, uint32_t Flags)
: Kind(1 /*PTX*/), unknown02(0x0101), HeaderSize(sizeof(*this)),
DataSize(DataSize), unknown0c(0), CompressedSize(0),
SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4),
CudaArch(CudaArch), unknown20(0), unknown24(0), Flags(Flags),
unknown2c(0), unknown30(0), unknown34(0), UncompressedSize(0),
unknown3c(0), unknown40(0), unknown44(0) {}
};
FatBinFileHeader fatBinFileHeader(m_PTX_code.size(), m_CuArgs->smVersion,
m_CuArgs->fatbinFlags);
FatBinHeader fatBinHeader(m_PTX_code.size() + fatBinFileHeader.HeaderSize);
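// on-disk layout: FatBinHeader, immediately followed by FatBinFileHeader,
// followed by the null-padded PTX text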
os.write((char*)&fatBinHeader, fatBinHeader.HeaderSize);
os.write((char*)&fatBinFileHeader, fatBinFileHeader.HeaderSize);
os << m_PTX_code;
return true;
}
void IncrementalCUDADeviceCompiler::dump() {
llvm::outs() << "CUDA device compiler is valid: " << m_Init << "\n"
<< "file path: " << m_FilePath << "\n"
<< "fatbin file path: " << m_FatbinFilePath << "\n"
<< "m_CuArgs c++ standard: " << m_CuArgs->cppStdVersion << "\n"
<< "m_CuArgs host triple: " << m_CuArgs->hostTriple.str()
<< "\n"
<< "m_CuArgs Nvidia SM Version: " << m_CuArgs->smVersion
<< "\n"
<< "m_CuArgs Fatbin Flags (see "
"IncrementalCUDADeviceCompiler::setCuArgs()): "
<< std::bitset<7>(m_CuArgs->fatbinFlags).to_string() << "\n"
<< "m_CuArgs verbose: " << m_CuArgs->verbose << "\n"
<< "m_CuArgs debug: " << m_CuArgs->debug << "\n";
llvm::outs() << "m_CuArgs additional clang nvptx options: ";
for (const std::string& s : m_CuArgs->additionalPtxOpt) {
llvm::outs() << s << " ";
}
llvm::outs() << "\n";
}
} // end namespace cling