
510 - Assert-fail if MPI functions are accessed inside scheduled callbacks from within VT #792

Merged
merged 23 commits on Jun 2, 2020

Changes from 16 commits

Commits (23)
82a499d
#510 add MPI guard access feature to CMake/configs
pnstickne Apr 28, 2020
d5ab586
#510 add method to control MPI access
pnstickne Apr 28, 2020
e194794
#510 add MPI/PMPI wrappers with guard checks
pnstickne Apr 20, 2020
17cee22
#510 ensure MPI access suppressed by default in scheduler
pnstickne Apr 28, 2020
f909ecf
#510 add MPI access guard tests
pnstickne May 12, 2020
8671bc4
#510 add vtAssertMPISuccess macro
pnstickne May 12, 2020
58ae11d
#510 limit ScopedMPIAccess depth
pnstickne May 12, 2020
de7b698
#510 allow internal VT calls to access MPI
pnstickne Apr 28, 2020
6b2b90a
#510 allow direct MPI access in collective_alg.cc
pnstickne May 25, 2020
35c2409
#510 generate MPI wrappers as part of CMake
pnstickne May 26, 2020
43e85aa
#510: rdma: fix multiple nesting during merge
lifflander May 28, 2020
10bfe63
#510 ensure lower-case CMAKE build compare
pnstickne May 30, 2020
2fbdd76
#510 improve MPI access test resiliency
pnstickne May 31, 2020
d818865
#510 remove extra mpi_access.h inclusion
pnstickne May 31, 2020
11c2fe3
#510 exclude guard checks for MPI_Comm_get_* and MPI_Get_*
pnstickne May 31, 2020
0cc211a
#510 allow collective_alg action() access to MPI calls - again
pnstickne May 31, 2020
a2d253d
#510: pmpi: generate a new 3.1 function list
lifflander Jun 1, 2020
350fde5
#510: pmpi: overwrite the old mpi_function.h.in file
lifflander Jun 2, 2020
aed7b05
#510: pmpi: comment out functions that don't build
lifflander Jun 2, 2020
64faf90
#510: CI: add VT_MPI_GUARD_ENABLED to build scripts
lifflander Jun 2, 2020
92cb893
#510: pmpi: include Get_accumulate
lifflander Jun 2, 2020
a968520
#510: pmpi: remove LICENSE file now that we have our own functions
lifflander Jun 2, 2020
4e00b04
#510: CI: change default to guard enabled if nothing passed
lifflander Jun 2, 2020
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -70,6 +70,13 @@ include(cmake/check_compiler.cmake)

option(vt_fcontext_enabled "Build VT with fcontext (ULT) enabled" OFF)

string(TOLOWER "${CMAKE_BUILD_TYPE}" LOWERCASE_CMAKE_BUILD_TYPE)
if (LOWERCASE_CMAKE_BUILD_TYPE STREQUAL "debug")
option(vt_mpi_guards "Build VT with poison MPI calls: code invoked from VT callbacks cannot invoke MPI functions" ON)
else()
option(vt_mpi_guards "Build VT with poison MPI calls: code invoked from VT callbacks cannot invoke MPI functions" OFF)
endif()

# Bundled dependencies

if (${vt_fcontext_enabled})
12 changes: 12 additions & 0 deletions cmake/define_build_types.cmake
@@ -140,6 +140,18 @@ else()
set(vt_feature_cmake_mimalloc "0")
endif()

if (vt_mpi_guards AND PERL_FOUND)
message(STATUS "Building VT with user MPI prevention guards enabled")
set(vt_feature_cmake_mpi_access_guards "1")
elseif (vt_mpi_guards AND NOT PERL_FOUND)
# No perl? Can't generate wrapper source file.
message(STATUS "Building VT with user MPI prevention guards disabled (requested, but perl not found)")
set(vt_feature_cmake_mpi_access_guards "0")
else()
message(STATUS "Building VT with user MPI prevention guards disabled")
set(vt_feature_cmake_mpi_access_guards "0")
endif()

set(vt_feature_cmake_no_feature "0")
set(vt_feature_cmake_production "0")

4 changes: 4 additions & 0 deletions cmake/load_packages.cmake
@@ -5,12 +5,16 @@ include(cmake/load_local_packages.cmake)

# MPI package
include(cmake/load_mpi_package.cmake)

# ZLIB package
include(cmake/load_zlib_package.cmake)

# Discover and load threading configuration
include(cmake/load_threading_package.cmake)

# Perl is used to build the PMPI wrappers
find_package(Perl)

# Doxygen package
include(cmake/load_doxygen.cmake)

1 change: 1 addition & 0 deletions cmake_config.h.in
@@ -65,6 +65,7 @@
#define vt_feature_cmake_fcontext @vt_feature_cmake_fcontext@
#define vt_feature_cmake_test_trace_on @vt_feature_cmake_test_trace_on@
#define vt_feature_cmake_mimalloc @vt_feature_cmake_mimalloc@
#define vt_feature_cmake_mpi_access_guards @vt_feature_cmake_mpi_access_guards@

#cmakedefine vt_quirked_trivially_copyable_on_msg
#cmakedefine vt_quirked_serialize_method_detection
22 changes: 22 additions & 0 deletions src/CMakeLists.txt
@@ -8,6 +8,7 @@ set(TOP_LEVEL_SUBDIRS
objgroup
pool
pipe
pmpi
rdma
topos
vrt
@@ -172,6 +173,27 @@ foreach(SUB_DIR ${PROJECT_SUBDIRS_LIST})
)
endforeach()

# Generate PMPI wrappers, if enabled and possible.
if(vt_mpi_guards AND PERL_FOUND)
set(MPI_WRAP_GENERATED "${PROJECT_BIN_DIR}/src/vt/pmpi/generated/mpiwrap.cc")

file(MAKE_DIRECTORY "${PROJECT_BIN_DIR}/src/vt/pmpi/generated")

add_custom_command(
OUTPUT ${MPI_WRAP_GENERATED}
COMMAND ${PERL_EXECUTABLE}
ARGS "${CMAKE_CURRENT_SOURCE_DIR}/vt/pmpi/generate_mpi_wrappers.pl"
"${CMAKE_CURRENT_SOURCE_DIR}/vt/pmpi/mpi_functions.h.in"
"${MPI_WRAP_GENERATED}"
)

list(
APPEND
SOURCE_FILES
${MPI_WRAP_GENERATED}
)
endif()

add_library(
${VIRTUAL_TRANSPORT_LIBRARY}
STATIC
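For orientation, the generated wrapper file relies on the standard MPI profiling (PMPI) interface: a wrapper defines the `MPI_` symbol, runs a guard check, and forwards to the `PMPI_` entry point, which still reaches the real implementation. Below is a minimal hedged sketch of that shape; it is not the actual output of `generate_mpi_wrappers.pl`, and the guard variable is a stand-in for VT's real state in src/vt/runtime/mpi_access.h.

```cpp
// Hedged sketch of one generated PMPI wrapper (assumed shape; not the real
// output of generate_mpi_wrappers.pl). Names marked "sketch" are inventions.
#include <mpi.h>
#include <cassert>

// Stand-in for VT's real guard state (see src/vt/runtime/mpi_access.h).
static bool mpi_calls_allowed_sketch = true;

extern "C" int MPI_Barrier(MPI_Comm comm) {
  // Fail fast when user code reaches MPI from inside a VT-scheduled callback.
  assert(mpi_calls_allowed_sketch &&
         "MPI_Barrier: MPI calls are not permitted from VT callbacks");
  return PMPI_Barrier(comm); // PMPI_ reaches the real MPI implementation
}
```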
110 changes: 66 additions & 44 deletions src/vt/collective/collective_alg.cc
@@ -44,6 +44,8 @@

#include "vt/collective/collective_alg.h"

#include <cstdint>

namespace vt { namespace collective {

CollectiveAlg::CollectiveAlg()
Expand Down Expand Up @@ -72,6 +74,59 @@ CollectiveScope CollectiveAlg::makeCollectiveScope(TagType in_scope_tag) {
return CollectiveScope(is_user_tag, scope_tag);
}

// We cannot use a VT broadcast here because it involves the scheduler and
// MPI progress, and there is no safe way to guarantee that the message has
// been received and processed without requiring progress from another node.
static void broadcastConsensus(
int root, TagType& consensus_tag, TagType& consensus_scope, bool& consensus_is_user_tag
) {
auto comm = theContext()->getComm();

MPI_Request req;

// MPI_INT32_T x 3
int32_t req_buffer[] = {
int32_t{consensus_tag},
int32_t{consensus_scope},
int32_t{consensus_is_user_tag}
};

// Do an async broadcast of the scope/seq we intend to execute collectively
{
VT_ALLOW_MPI_CALLS;
MPI_Ibcast(req_buffer, 3, MPI_INT32_T, root, comm, &req);
}

theSched()->runSchedulerWhile([&req]{
VT_ALLOW_MPI_CALLS;
int flag = 0;
MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
return not flag;
});

consensus_tag = req_buffer[0];
consensus_scope = req_buffer[1];
consensus_is_user_tag = req_buffer[2] not_eq 0;

vtAssert(consensus_tag != no_tag, "Selected tag must be valid");
vtAssert(consensus_scope != no_tag, "Selected scope must be valid");

// We need a barrier here so the root doesn't finish the broadcast first and
// then enter the collective. We could replace the Ibcast/Ibarrier with an
// Iallreduce, but I think this is cheaper
{
VT_ALLOW_MPI_CALLS;
MPI_Ibarrier(comm, &req);
}

theSched()->runSchedulerWhile([&req]{
VT_ALLOW_MPI_CALLS;
int flag = 0;
MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
return not flag;
});
}

/*static*/ void CollectiveAlg::runCollective(CollectiveMsg* msg) {
// We need a reentrancy counter to ensure that this is only on the scheduler
// stack once!
@@ -88,7 +143,7 @@ CollectiveScope CollectiveAlg::makeCollectiveScope(TagType in_scope_tag) {
auto const this_node = theContext()->getNode();
TagType consensus_scope = no_tag;
TagType consensus_tag = no_tag;
int consensus_is_user_tag = 0; // bool encoded as an int for MPI
bool consensus_is_user_tag = false;

// The root decides the next tag and tells the remaining nodes
if (msg->root_ == this_node) {
@@ -111,47 +166,10 @@ CollectiveScope CollectiveAlg::makeCollectiveScope(TagType in_scope_tag) {
consensus_is_user_tag = msg->is_user_tag_;
}

// We can not use a VT broadcast here because it involves the scheduler and
// MPI progress and there's no safe way to guarantee that the message has been
// received and processed and doesn't require progress from another node

int const root = msg->root_;
auto comm = theContext()->getComm();
MPI_Request req1, req2, req3;

// Do a async broadcast of the scope/seq we intend to execute collectively
// This potentially could be merged into one Ibcast if TagType remains 32-bits
MPI_Ibcast(&consensus_tag, 1, MPI_INT32_T, root, comm, &req1);
MPI_Ibcast(&consensus_scope, 1, MPI_INT32_T, root, comm, &req2);
MPI_Ibcast(&consensus_is_user_tag, 1, MPI_C_BOOL, root, comm, &req3);

int flag1 = 0, flag2 = 0, flag3 = 0;
theSched()->runSchedulerWhile([&req1,&req2,&req3,&flag1,&flag2,&flag3]{
if (not flag1) {
MPI_Test(&req1, &flag1, MPI_STATUS_IGNORE);
}
if (not flag2) {
MPI_Test(&req2, &flag2, MPI_STATUS_IGNORE);
}
if (not flag3) {
MPI_Test(&req3, &flag3, MPI_STATUS_IGNORE);
}
return not (flag1 and flag2 and flag3);
});

vtAssert(consensus_tag != no_tag, "Selected tag must be valid");
vtAssert(consensus_scope != no_tag, "Selected scope must be valid");

// We need a barrier here so the root doesn't finish the broadcast first and
// then enter the collective. We could replace the Ibcast/Ibarrier with a
// Iallreduce, but I think this is cheaper
MPI_Ibarrier(comm, &req1);

theSched()->runSchedulerWhile([&req1]{
int flag4 = 0;
MPI_Test(&req1, &flag4, MPI_STATUS_IGNORE);
return not flag4;
});
broadcastConsensus(
msg->root_,
consensus_tag, consensus_scope, consensus_is_user_tag // in-out refs
);

debug_print(
gen, node,
@@ -177,8 +195,12 @@ CollectiveScope CollectiveAlg::makeCollectiveScope(TagType in_scope_tag) {
);
auto action = iter_seq->second.action_;

// Run the collective safely
action();
// Run the collective safely.
// The action is expected to use MPI calls, not VT calls.
{
VT_ALLOW_MPI_CALLS;
action();
}

// Erase the tag that was actually executed
impl->planned_collective_.erase(iter_seq);
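As a reading aid for the `VT_ALLOW_MPI_CALLS` blocks above: the commit log ("limit ScopedMPIAccess depth") suggests the macro instantiates a depth-counted scope guard. Here is a self-contained sketch under that assumption; every name is illustrative, not VT's actual API from src/vt/runtime/mpi_access.h.

```cpp
// Illustrative sketch only; names and limits are assumptions, not VT's API.
#include <cassert>

class ScopedMPIAccessSketch {
public:
  ScopedMPIAccessSketch() {
    ++depth_;
    assert(depth_ <= max_depth_ && "MPI access scopes nested too deeply");
  }
  ~ScopedMPIAccessSketch() { --depth_; }

  // A PMPI wrapper would consult this before forwarding the call.
  static bool mpiCallsAllowed() { return depth_ > 0; }

private:
  static inline int depth_ = 0;         // C++17 inline static data member
  static constexpr int max_depth_ = 8;  // assumed nesting limit
};

// The macro then presumably expands to something like:
//   #define VT_ALLOW_MPI_CALLS ScopedMPIAccessSketch vt_mpi_scope_guard{}
```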
14 changes: 13 additions & 1 deletion src/vt/configs/error/config_assert.h
@@ -114,6 +114,14 @@
#define vtAssertExpr(cond) vt_force_use(cond)
#define vtAssertExprInfo(cond,...) vt_force_use(cond,__VA_ARGS__)
#define vtWarnInfo(cond,str,...) vt_force_use(cond,__VA_ARGS__)
/**
* \internal
* Assert that the MPI call's return code is MPI_SUCCESS (0).
*
* The failure assert contains the MPI call name (e.g. "MPI_Iprobe"), a
* short summary (e.g. "failed"), and the actual return value.
*/
#define vtAssertMPISuccess(ret,mpi_name) vt_force_use(ret)
#else
#define vtAssertImpl(fail,cond,str) \
do { \
@@ -160,8 +168,12 @@

#define vtAssertNot(cond,str) vtAssert(INVERT_COND(cond),str)
#define vtAssertNotExpr(cond) vtAssertExpr(INVERT_COND(cond))
#define vtAssertNotInfo(cond,str,...) \
#define vtAssertNotInfo(cond,str,...) \
vtAssertInfo(INVERT_COND(cond),str,__VA_ARGS__)

#define vtAssertMPISuccess(ret,mpi_name) vtAssertInfo( \
(ret == 0), "MPI call '" mpi_name "' failed.", ret \
)
#endif

#endif /*INCLUDED_CONFIGS_ERROR_CONFIG_ASSERT_H*/
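A brief usage sketch for the new macro (the surrounding variables are assumed, not code from this PR); in production builds it degrades to `vt_force_use(ret)`, so the call stays warning-free. Note the call name must be a string literal, since the macro concatenates it into the message.

```cpp
// Hedged example of vtAssertMPISuccess; 'comm' is assumed to be in scope.
MPI_Request req;
int const ret = MPI_Ibarrier(comm, &req);
vtAssertMPISuccess(ret, "MPI_Ibarrier"); // asserts ret == MPI_SUCCESS (0)
```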
1 change: 1 addition & 0 deletions src/vt/configs/features/features_defines.h
@@ -67,5 +67,6 @@
#define vt_feature_cons_multi_idx 0 || vt_feature_cmake_cons_multi_idx
#define vt_feature_fcontext 0 || vt_feature_cmake_fcontext
#define vt_feature_mimalloc 0 || vt_feature_cmake_mimalloc
#define vt_feature_mpi_access_guards 0 || vt_feature_cmake_mpi_access_guards

#endif /*INCLUDED_VT_CONFIGS_FEATURES_FEATURES_DEFINES_H*/
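The new define lets guard-related code be gated at compile time; a minimal hedged sketch (the gated body is illustrative, not from this PR):

```cpp
// Hedged sketch: compile-time gating on the new feature define.
#include "vt/configs/features/features_defines.h"

#if vt_feature_mpi_access_guards
static constexpr bool vt_guards_enabled_sketch = true;  // guards compiled in
#else
static constexpr bool vt_guards_enabled_sketch = false; // wrappers inert
#endif
```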
1 change: 1 addition & 0 deletions src/vt/configs/features/features_featureswitch.h
@@ -65,5 +65,6 @@
#define vt_feature_str_trace_enabled "Tracing Projections"
#define vt_feature_str_cons_multi_idx "Collection Constructor Positional"
#define vt_feature_str_priorities "Message priorities"
#define vt_feature_str_mpi_access_guards "MPI access guards"

#endif /*INCLUDED_VT_CONFIGS_FEATURES_FEATURES_FEATURESWITCH_H*/
8 changes: 6 additions & 2 deletions src/vt/event/event_record.cc
@@ -44,6 +44,7 @@

#include "vt/event/event.h"
#include "vt/event/event_record.h"
#include "vt/runtime/mpi_access.h"

#include <memory>

@@ -56,7 +57,7 @@ EventRecord::EventRecord(EventRecordType const& type, EventType const& id)

switch (type) {
case EventRecordType::MPI_EventRecord:
event_union_.mpi_req = MPI_Request();
event_union_.mpi_req = MPI_REQUEST_NULL;
break;
case EventRecordType::NormalEventRecord:
break;
@@ -83,7 +84,10 @@ bool EventRecord::testMPIEventReady() {
MPI_Request* req = getRequest();
MPI_Status stat;

MPI_Test(req, &flag, &stat);
{
VT_ALLOW_MPI_CALLS;
MPI_Test(req, &flag, &stat);
}

bool const mpiready = flag == 1;
