Skip to content

Commit

Permalink
Merge branch 'main' into cccl_std_conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
cliffburdick authored Jun 3, 2024
2 parents 8b099d7 + 381a6b2 commit 8e08f40
Show file tree
Hide file tree
Showing 67 changed files with 960 additions and 894 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.vscode
.cache
build*
*.pyc
20 changes: 12 additions & 8 deletions bench/00_operators/operators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ template <typename ValueType>
void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)
{
// Get current parameters:
cudaExecutor exec{0};
const int x_len = static_cast<int>(state.get_int64("Vector size"));

state.add_element_count(x_len, "NumElements");
Expand All @@ -19,9 +20,9 @@ void vector_add(nvbench::state &state, nvbench::type_list<ValueType>)

tensor_t<ValueType, 1> xv{{x_len}};
tensor_t<ValueType, 1> xv2{{x_len}};
xv.PrefetchDevice(0);
(xv = xv + xv2).run();
cudaDeviceSynchronize();

(xv = xv + xv2).run(exec);
exec.sync();

state.exec(
[&xv, &xv2](nvbench::launch &launch) {
Expand All @@ -38,6 +39,7 @@ using permute_types = nvbench::type_list<float, double, cuda::std::complex<float
template <typename ValueType>
void permute(nvbench::state &state, nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto x = make_tensor<ValueType>({1000,200,6,300});
auto y = make_tensor<ValueType>({300,1000,6,200});

Expand All @@ -46,7 +48,7 @@ void permute(nvbench::state &state, nvbench::type_list<ValueType>)
state.add_global_memory_writes<ValueType>(x.TotalSize());

x.PrefetchDevice(0);
cudaDeviceSynchronize();
exec.sync();

state.exec(
[&x, &y](nvbench::launch &launch) {
Expand All @@ -61,17 +63,18 @@ using random_types = nvbench::type_list<float, double, cuda::std::complex<float>
template <typename ValueType>
void random(nvbench::state &state, nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto x = make_tensor<ValueType>({1966800});
auto y = make_tensor<ValueType>({1966800});
x.PrefetchDevice(0);
y.PrefetchDevice(0);

(y = random<float>(x.Shape(), NORMAL)).run();
(y = random<float>(x.Shape(), NORMAL)).run(exec);

state.add_element_count(x.TotalSize(), "NumElements");
state.add_global_memory_writes<ValueType>(x.TotalSize());

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&x, &y](nvbench::launch &launch) {
Expand All @@ -98,6 +101,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
int n = 600;
ValueType dx = M_PI/n;

cudaExecutor exec{};
auto col = range<0>({n+1},ValueType(0), ValueType(dx));
auto az = range<0>({2*n+1}, ValueType(0), ValueType(dx));

Expand All @@ -122,8 +126,8 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
auto Y = make_tensor<ValueType>(Ym.Shape());
auto Z = make_tensor<ValueType>(Zm.Shape());

cudaDeviceSynchronize();

exec.sync();
state.add_element_count(n+1, "Elements");

state.exec(
Expand Down
5 changes: 3 additions & 2 deletions bench/00_operators/reduction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ void reduce_4d(
nvbench::type_list<ValueType>
)
{
cudaExecutor exec{0};
const int size0 = static_cast<int>(state.get_int64("Size0"));
const int size1 = static_cast<int>(state.get_int64("Size1"));
const int size2 = static_cast<int>(state.get_int64("Size2"));
Expand All @@ -138,8 +139,8 @@ void reduce_4d(
t1.PrefetchDevice(0);
t4.PrefetchDevice(0);

(t4 = random<float>(t4.Shape(), UNIFORM)).run();
cudaDeviceSynchronize();
(t4 = random<float>(t4.Shape(), UNIFORM)).run(exec);
exec.sync();

state.exec([&t4, &t1](nvbench::launch &launch) {
(t1 = matx::sum(t4, {1, 2, 3})).run((cudaStream_t)launch.get_stream()); });
Expand Down
20 changes: 12 additions & 8 deletions bench/00_transform/conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ template <typename ValueType>
void conv1d_direct_4d_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});
Expand All @@ -21,7 +22,7 @@ void conv1d_direct_4d_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();
MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -35,7 +36,7 @@ template <typename ValueType>
void conv1d_direct_2d_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{

cudaExecutor exec{0};

auto out = make_tensor<ValueType>({4 * 2* 14, 288 + 4096 + 133 - 1});
auto at = make_tensor<ValueType>({ 4 * 2* 14, 133});
Expand All @@ -45,7 +46,7 @@ void conv1d_direct_2d_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -56,6 +57,7 @@ template <typename ValueType>
void conv1d_direct_large(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});
Expand All @@ -64,9 +66,9 @@ void conv1d_direct_large(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

(out = conv1d(at, bt, MATX_C_MODE_FULL)).run();
(out = conv1d(at, bt, MATX_C_MODE_FULL)).run(exec);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -79,17 +81,18 @@ template <typename ValueType>
void conv1d_fft_large(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({state.get_int64("Signal Size")});
auto bt = make_tensor<ValueType>({state.get_int64("Filter Size")});
auto out = make_tensor<ValueType>({at.Size(at.Rank()-1) + bt.Size(bt.Rank()-1) - 1});

(out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run();
(out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(exec);

out.PrefetchDevice(0);
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv1d(at, bt, MATX_C_MODE_FULL, MATX_C_METHOD_FFT)).run(cudaExecutor(launch.get_stream())); });
Expand All @@ -103,6 +106,7 @@ template <typename ValueType>
void conv2d_direct_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{
cudaExecutor exec{0};
auto at = make_tensor<ValueType>({256, 1024, 1024});
auto bt = make_tensor<ValueType>({256, 16, 16});
auto out = make_tensor<ValueType>({256,
Expand All @@ -113,7 +117,7 @@ void conv2d_direct_batch(nvbench::state &state,
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();
exec.sync();

state.exec(
[&out, &at, &bt](nvbench::launch &launch) { (out = conv2d(at, bt, MATX_C_MODE_FULL)).run(cudaExecutor(launch.get_stream())); });
Expand Down
3 changes: 2 additions & 1 deletion bench/00_transform/cub.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ void sort1d(
nvbench::type_list<ValueType>
)
{
cudaExecutor exec{0};
const int dataSize = static_cast<int>(state.get_int64("Tensor Size"));

auto sortedData = matx::make_tensor<ValueType>({dataSize});
auto randomData = matx::make_tensor<ValueType>({dataSize});

sortedData.PrefetchDevice(0);
randomData.PrefetchDevice(0);
cudaDeviceSynchronize();
exec.sync();

(randomData = random<float>(sortedData.Shape(), NORMAL)).run();

Expand Down
11 changes: 4 additions & 7 deletions bench/00_transform/qr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void qr_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -26,17 +27,13 @@ void qr_batch(nvbench::state &state,
auto Q = make_tensor<AType>({batch, m, m});
auto R = make_tensor<AType>({batch, m, n});

A.PrefetchDevice(stream);
Q.PrefetchDevice(stream);
R.PrefetchDevice(stream);

(A = random<float>({batch, m, n}, NORMAL)).run(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

// warm up
nvtxRangePushA("Warmup");
(mtie(Q, R) = qr(A)).run(stream);
(mtie(Q, R) = qr(A)).run(exec);

cudaDeviceSynchronize();
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand Down
38 changes: 15 additions & 23 deletions bench/00_transform/svd_power.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void svdpi_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -30,22 +31,17 @@ void svdpi_batch(nvbench::state &state,

int iterations = 10;

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
VT.PrefetchDevice(stream);

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

(U = 0).run(exec);
(S = 0).run(exec);
(VT = 0).run(exec);
auto x0 = random<float>({batch, r}, NORMAL);

// warm up
nvtxRangePushA("Warmup");
(mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(stream);
cudaDeviceSynchronize();
(mtie(U, S, VT) = svdpi(A, x0, iterations, r)).run(exec);
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand All @@ -70,6 +66,7 @@ void svdbpi_batch(nvbench::state &state,

cudaStream_t stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
cudaExecutor exec{stream};

int batch = state.get_int64("batch");
int m = state.get_int64("rows");
Expand All @@ -83,21 +80,16 @@ void svdbpi_batch(nvbench::state &state,

int iterations = 10;

(A = random<float>({batch, m, n}, NORMAL)).run(stream);

A.PrefetchDevice(stream);
U.PrefetchDevice(stream);
S.PrefetchDevice(stream);
VT.PrefetchDevice(stream);
(A = random<float>({batch, m, n}, NORMAL)).run(exec);

(U = 0).run(stream);
(S = 0).run(stream);
(VT = 0).run(stream);
(U = 0).run(exec);
(S = 0).run(exec);
(VT = 0).run(exec);

// warm up
nvtxRangePushA("Warmup");
(mtie(U, S, VT) = svdbpi(A, iterations)).run(stream);
cudaDeviceSynchronize();
(mtie(U, S, VT) = svdbpi(A, iterations)).run(exec);
exec.sync();
nvtxRangePop();

MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
Expand Down
1 change: 1 addition & 0 deletions docs_input/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ API Reference
casting/index.rst
window/index.rst
signalimage/index.rst
synchronization/index.rst
polynomials/index.rst
random/random.rst
dft/index.rst
Expand Down
10 changes: 10 additions & 0 deletions docs_input/api/synchronization/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.. _synchronization:

Synchronization
###############

.. toctree::
:maxdepth: 1
:glob:

*
18 changes: 18 additions & 0 deletions docs_input/api/synchronization/sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
.. _sync_func:

sync
====

Wait for any code running on an executor to complete.

.. doxygenfunction:: matx::cudaExecutor::sync()
.. doxygenfunction:: matx::HostExecutor::sync()

Examples
~~~~~~~~

.. literalinclude:: ../../../examples/cgsolve.cu
:language: cpp
:start-after: example-begin sync-test-1
:end-before: example-end sync-test-1
:dedent:
2 changes: 0 additions & 2 deletions docs_input/notebooks/exercises/example1_assignment1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ int main() {
// t2 = ;
/*** End editing ***/

t2.PrefetchDevice(0);

/****************************************************************************************************
* Get a slice of the second and third rows with all columns
* https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing
Expand Down
8 changes: 5 additions & 3 deletions docs_input/notebooks/exercises/example2_assignment1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ int main() {
auto B = make_tensor<float>({2, 3});
auto V = make_tensor<float>({3});

cudaExecutor exec{};

/****************************************************************************************************
* Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.4,
*and tensor V from -1 to -3 in steps of -1.
Expand Down Expand Up @@ -83,7 +85,7 @@ int main() {

/*** End editing ***/

cudaStreamSynchronize(0);
exec.sync();

step = 0.5;
for (int row = 0; row < A.Size(0); row++) {
Expand Down Expand Up @@ -111,7 +113,7 @@ int main() {
/// auto tvs = ;
/*** End editing. ***/

// cudaStreamSynchronize(0);
// exec.sync();

// step = 0.5;
// for (int row = 0; row < A.Size(0); row++) {
Expand All @@ -137,7 +139,7 @@ int main() {

/*** End editing ***/

cudaStreamSynchronize(0);
exec.sync();

for (int row = 0; row < B.Size(0); row++) {
for (int col = 0; col < B.Size(1); col++) {
Expand Down
Loading

0 comments on commit 8e08f40

Please sign in to comment.