Merge branch 'microsoft:main' into master
WolframRhodium authored Apr 18, 2024
2 parents 24652b9 + 745b426 commit 2911a63
Showing 107 changed files with 4,775 additions and 1,113 deletions.
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="python" version="3.9.7" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.13.1" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.14.0" targetFramework="native" />
<package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
</packages>
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="pythonx86" version="3.9.7" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.13.1" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.14.0" targetFramework="native" />
<package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
</packages>
3 changes: 3 additions & 0 deletions cmake/CMakeLists.txt
@@ -848,6 +848,9 @@ if (onnxruntime_USE_DML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_DML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DML=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES dml)
if(onnxruntime_ENABLE_NPU_ADAPTER_ENUMERATION)
list(APPEND ORT_PROVIDER_FLAGS -DENABLE_NPU_ADAPTER_ENUMERATION=1)
endif()
endif()
if (onnxruntime_USE_MIGRAPHX)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_MIGRAPHX=1)
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.13.1)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.0)

# Restore nuget packages, which will pull down the DirectML redist package.
add_custom_command(
7 changes: 7 additions & 0 deletions cmake/onnxruntime_python.cmake
@@ -380,6 +380,9 @@ if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_ortmodule_graph_optimizers_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/graph_optimizers/*"
)
file(GLOB onnxruntime_python_ortmodule_pipe_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/pipe/*"
)
file(GLOB onnxruntime_python_ort_triton_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/*.py"
)
@@ -756,6 +759,7 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/pipe
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/kernel
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils
@@ -806,6 +810,9 @@ if (onnxruntime_ENABLE_TRAINING)
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_graph_optimizers_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_pipe_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/pipe/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ort_triton_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/
3 changes: 3 additions & 0 deletions cmake/onnxruntime_unittests.cmake
@@ -1255,6 +1255,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
if (onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
endif()
if (onnxruntime_USE_DML)
list(APPEND onnxruntime_shared_lib_test_LIBS d3d12.lib)
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs})
endif()
28 changes: 28 additions & 0 deletions docs/ORTModule_Training_Guidelines.md
@@ -495,3 +495,31 @@ for epoch in range(start_epoch, n_epochs):
```
Check [LoadBalancingDistributedBatchSampler implementation](../orttraining/orttraining/python/training/utils/data/sampler.py) for more details.

## 8 Using ORTPipelineModule for Deepspeed Pipeline Parallel

You can use `ORTPipelineModule` to support Deepspeed Pipeline Parallelism. Here's how you can integrate it into your pipeline:

```python
from onnxruntime.training.ortmodule import DebugOptions, LogLevel
from onnxruntime.training.ortmodule.experimental.pipe import ORTPipelineModule

# Create a debug configuration if needed.
# Since we're exporting multiple graphs here, this will generate multiple graphs with their index added as a prefix to differentiate them.
debug_options = DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="model_name")

# Keep your DeepSpeed script the same and use ORTPipelineModule instead of PipelineModule.
# `layers` is the same list of layers you would pass to DeepSpeed's PipelineModule.
pipeline_module = ORTPipelineModule(
    layers,
    num_stages=2,  # Set your number of stages
    base_seed=1234,
    partition_method="parameters",
    debug_options=debug_options  # Pass the debug configuration if needed
)
# Keep the rest of the script as it is.
```

Check [ORTPipelineModule implementation](../orttraining/orttraining/python/training/ortmodule/experimental/pipe/_ort_pipeline_module.py) for more details.
6 changes: 6 additions & 0 deletions include/onnxruntime/core/graph/graph.h
@@ -727,6 +727,12 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
* and replaces graph initializers with its content.
*/
common::Status InjectExternalInitializedTensors(const InlinedHashMap<std::string, OrtValue>& external_initializers);

/** This function takes externally provided files in memory for initializers with external
* data and replaces graph initializers with their contents.
*/
common::Status InjectExternalInitializersFromFilesInMemory(
const InlinedHashMap<PathString, std::pair<char*, size_t>>& external_initializer_files);
#endif // !defined(DISABLE_EXTERNAL_INITIALIZERS)

#endif // !defined(ORT_MINIMAL_BUILD)
31 changes: 28 additions & 3 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3506,15 +3506,15 @@ struct OrtApi {
* \param[in] options
* \param[in] initializer_names Array of null-terminated UTF-8 encoded strings of the initializer names.
* \param[in] initializers Array of ::OrtValue type
* \param[in] initializers_num Number of elements in the initializer_names and initializers
* \param[in] num_initializers Number of elements in the initializer_names and initializers
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.12.
*/
ORT_API2_STATUS(AddExternalInitializers, _In_ OrtSessionOptions* options,
_In_reads_(input_len) const char* const* initializer_names,
_In_reads_(input_len) const OrtValue* const* initializers, size_t initializers_num);
_In_reads_(num_initializers) const char* const* initializer_names,
_In_reads_(num_initializers) const OrtValue* const* initializers, size_t num_initializers);

/** \brief: Create attribute of onnxruntime operator
*
@@ -4631,6 +4631,31 @@ struct OrtApi {
* \snippet{doc} snippets.dox OrtStatus Return Value
*/
ORT_API2_STATUS(KernelInfoGetAllocator, _In_ const OrtKernelInfo* info, _In_ OrtMemType mem_type, _Outptr_ OrtAllocator** out);

/** \brief Replace initialized Tensors that use external data with the provided files in memory
*
* The function finds the initialized TensorProtos with external data in the graph, using the provided
* external file names and the file contents in memory. The API reads the external file name, offset, and data length
* from each TensorProto and locates the tensor data in the corresponding in-memory file buffer.
* It creates a Tensor to replace the existing Tensor in the graph. The replacement
* occurs before any of the optimizations take place. The data is copied into the graph,
* since a TensorProto can't refer to user-provided buffers.
*
* \param[in] options The session options.
* \param[in] external_initializer_file_names Array of null-terminated UTF-8 encoded strings of the file names
* which hold the external initializers.
* \param[in] external_initializer_file_buffer_array Array of pointers to the buffers holding the file contents.
* The buffers can be freed after session creation.
* \param[in] external_initializer_file_lengths Array of size_t values giving the length of each file's content
* \param[in] num_external_initializer_files Number of external files
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*/
ORT_API2_STATUS(AddExternalInitializersFromFilesInMemory, _In_ OrtSessionOptions* options,
_In_reads_(num_external_initializer_files) const ORTCHAR_T* const* external_initializer_file_names,
_In_reads_(num_external_initializer_files) char* const* external_initializer_file_buffer_array,
_In_reads_(num_external_initializer_files) const size_t* external_initializer_file_lengths,
size_t num_external_initializer_files);
};

/*
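To make the new API concrete, here is a minimal usage sketch against the C API. The model name (`model.onnx`), the external-data file name (`weights.bin`), and the helper function are placeholders, not part of this commit, and status returns from the API calls are left unchecked for brevity:

```cpp
#include "onnxruntime_c_api.h"

#include <fstream>
#include <vector>

// Hypothetical helper: create a session while supplying the model's external-data file from memory.
OrtSession* CreateSessionWithInMemoryWeights(const OrtApi* ort, OrtEnv* env) {
  // Read the external-data file referenced by the model's initializers into a buffer.
  std::ifstream f("weights.bin", std::ios::binary | std::ios::ate);
  std::vector<char> weights(static_cast<size_t>(f.tellg()));
  f.seekg(0);
  f.read(weights.data(), static_cast<std::streamsize>(weights.size()));

  OrtSessionOptions* options = nullptr;
  ort->CreateSessionOptions(&options);

  // The names must match the external-data locations recorded in the model's TensorProtos.
  const ORTCHAR_T* file_names[] = {ORT_TSTR("weights.bin")};
  char* file_buffers[] = {weights.data()};
  const size_t file_lengths[] = {weights.size()};
  ort->AddExternalInitializersFromFilesInMemory(options, file_names, file_buffers, file_lengths, 1);

  // The data is copied into the graph, so the buffer only needs to live until the session is created.
  OrtSession* session = nullptr;
  ort->CreateSession(env, ORT_TSTR("model.onnx"), options, &session);
  ort->ReleaseSessionOptions(options);
  return session;
}
```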
3 changes: 3 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -873,6 +873,9 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl<T> {

SessionOptionsImpl& AddInitializer(const char* name, const OrtValue* ort_val); ///< Wraps OrtApi::AddInitializer
SessionOptionsImpl& AddExternalInitializers(const std::vector<std::string>& names, const std::vector<Value>& ort_values); ///< Wraps OrtApi::AddExternalInitializers
SessionOptionsImpl& AddExternalInitializersFromFilesInMemory(const std::vector<std::basic_string<ORTCHAR_T>>& external_initializer_file_names,
const std::vector<char*>& external_initializer_file_buffer_array,
const std::vector<size_t>& external_initializer_file_lengths); ///< Wraps OrtApi::AddExternalInitializersFromFilesInMemory

SessionOptionsImpl& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA
SessionOptionsImpl& AppendExecutionProvider_CUDA_V2(const OrtCUDAProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA_V2
21 changes: 21 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -781,6 +781,27 @@ inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AddExternalInitializers(con
return *this;
}

template <typename T>
inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AddExternalInitializersFromFilesInMemory(const std::vector<std::basic_string<ORTCHAR_T>>& file_names,
const std::vector<char*>& buffer_array,
const std::vector<size_t>& file_lengths) {
const size_t inputs_num = file_names.size();
if (inputs_num != buffer_array.size()) {
ORT_CXX_API_THROW("Expecting names and buffer_array to have the same length", ORT_INVALID_ARGUMENT);
}
if (inputs_num != file_lengths.size()) {
ORT_CXX_API_THROW("Expecting names and file_lengths to have the same length", ORT_INVALID_ARGUMENT);
}
std::vector<const ORTCHAR_T*> names_ptr;
names_ptr.reserve(inputs_num);
for (size_t i = 0; i < inputs_num; ++i) {
names_ptr.push_back(file_names[i].c_str());
}
ThrowOnError(GetApi().AddExternalInitializersFromFilesInMemory(this->p_, names_ptr.data(), buffer_array.data(),
file_lengths.data(), inputs_num));
return *this;
}

template <typename T>
inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options) {
ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_CUDA(this->p_, &provider_options));
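The same flow through the C++ wrapper added above, as a hedged sketch with the same placeholder file and model names; note that the wrapper throws `Ort::Exception` rather than returning a status when the three vectors differ in length, as the inline implementation shows:

```cpp
#include "onnxruntime_cxx_api.h"

#include <fstream>
#include <string>
#include <vector>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "external-init-example");

  // Load the external-data file into memory (placeholder file name).
  std::ifstream f("weights.bin", std::ios::binary | std::ios::ate);
  std::vector<char> weights(static_cast<size_t>(f.tellg()));
  f.seekg(0);
  f.read(weights.data(), static_cast<std::streamsize>(weights.size()));

  std::vector<std::basic_string<ORTCHAR_T>> names{ORT_TSTR("weights.bin")};
  std::vector<char*> buffers{weights.data()};
  std::vector<size_t> lengths{weights.size()};

  Ort::SessionOptions options;
  options.AddExternalInitializersFromFilesInMemory(names, buffers, lengths);

  // The buffer contents are copied during session creation, so `weights` may go out of scope afterwards.
  Ort::Session session(env, ORT_TSTR("model.onnx"), options);
  // ... run inference ...
  return 0;
}
```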
1 change: 1 addition & 0 deletions js/node/.gitignore
@@ -6,4 +6,5 @@ node_modules/

/lib/**/*.js
/script/**/*.js
!/script/install.js
/test/**/*.js
1 change: 1 addition & 0 deletions js/node/.npmignore
@@ -1,6 +1,7 @@
/.vscode/
/build/
/script/
!/script/install.js
/src/
/test/

28 changes: 23 additions & 5 deletions js/node/README.md
@@ -14,20 +14,38 @@ Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxrun

## Requirements

ONNXRuntime works on Node.js v12.x+ or Electron v5.x+.
ONNXRuntime works on Node.js v16.x+ (recommend v18.x+) or Electron v15.x+ (recommend v28.x+).

Following platforms are supported with pre-built binaries:
The following table lists the execution providers and platforms for which pre-built binaries of the ONNX Runtime Node.js binding are provided.

- Windows x64 CPU NAPI_v3
- Linux x64 CPU NAPI_v3
- MacOS x64 CPU NAPI_v3
| EPs/Platforms | Windows x64 | Windows arm64 | Linux x64         | Linux arm64 | MacOS x64 | MacOS arm64 |
| ------------- | ----------- | ------------- | ----------------- | ----------- | --------- | ----------- |
| CPU           | ✔️          | ✔️            | ✔️                | ✔️          | ✔️        | ✔️          |
| DirectML      | ✔️          | ✔️            |                   |             |           |             |
| CUDA          |             |               | ✔️<sup>\[1]</sup> |             |           |             |

- \[1]: CUDA v11.8.

To use it on platforms without pre-built binaries, you can build the Node.js binding from source and consume it by running `npm install <onnxruntime_repo_root>/js/node/`. See also the [instructions](https://onnxruntime.ai/docs/build/inferencing.html#apis-and-language-bindings) for building the ONNX Runtime Node.js binding locally.

## GPU Support

Right now, the Windows version supports only the DML provider. Linux x64 can use CUDA and TensorRT.

## CUDA EP Installation

To use the CUDA EP, you need to install the CUDA EP binaries. By default, they are installed automatically when you install the package. If you want to skip this step, pass the `--onnxruntime-node-install-cuda=skip` flag to the installation command.

```
npm install onnxruntime-node --onnxruntime-node-install-cuda=skip
```

You can also use this flag to specify the CUDA version (v11 or v12):

```
npm install onnxruntime-node --onnxruntime-node-install-cuda=v12
```

## License

License information can be found [here](https://github.com/microsoft/onnxruntime/blob/main/README.md#license).