From d0633455d5634453484557378b4fa73db96e29d1 Mon Sep 17 00:00:00 2001 From: Otto Bittner Date: Tue, 27 Aug 2024 08:46:03 -0700 Subject: [PATCH] nvproxy: add ioctl `NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE` Hey, this adds a missing ioctl required to run workloads on H100s with Confidential Computing (CC) mode enabled. I couldn't find the respective ioctl in any supported driver version prior to 550.90.07, hence I added it only to that version's ABI. Without this patch, the following example fails: ```bash $ docker run --runtime=runsc --gpus=all pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime python -c "import torch; torch.cuda.init()" ``` The error is: ``` Traceback (most recent call last): File "/test.py", line 3, in <module> torch.cuda.init() File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 260, in init _lazy_init() File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 293, in _lazy_init torch._C._cuda_init() RuntimeError: No CUDA GPUs are available ``` At the same time, gVisor's debug logs show `nvproxy: unknown control command 0xcb33010c`. 
FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/10824 from derpsteb:ob/key-rotation 960c2d0925bc0f4056daf7d067c1dc8fbc47b4a2 PiperOrigin-RevId: 668003601 --- pkg/abi/nvgpu/ctrl.go | 1 + pkg/sentry/devices/nvproxy/version.go | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pkg/abi/nvgpu/ctrl.go b/pkg/abi/nvgpu/ctrl.go index 3942c4197c..8f97b1a9d7 100644 --- a/pkg/abi/nvgpu/ctrl.go +++ b/pkg/abi/nvgpu/ctrl.go @@ -561,4 +561,5 @@ const ( NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_CAPABILITIES = 0xcb330101 NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_GPUS_STATE = 0xcb330104 NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_NUM_SECURE_CHANNELS = 0xcb33010b + NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE = 0xcb33010c ) diff --git a/pkg/sentry/devices/nvproxy/version.go b/pkg/sentry/devices/nvproxy/version.go index ce68c3faf2..7feb027523 100644 --- a/pkg/sentry/devices/nvproxy/version.go +++ b/pkg/sentry/devices/nvproxy/version.go @@ -673,7 +673,22 @@ func Init() { _ = addDriverABI(550, 54, 14, "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3", v550_54_14) v550_54_15 := addDriverABI(550, 54, 15, "2e859ae5f912a9a47aaa9b2d40a94a14f6f486b5d3b67c0ddf8b72c1c9650385", v550_54_14) - _ = addDriverABI(550, 90, 07, "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733", v550_54_15) + + v550_90_07 := func() *driverABI { + abi := v550_54_15() + abi.controlCmd[nvgpu.NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE] = rmControlSimple + + prevNames := abi.getStructNames + abi.getStructNames = func() *driverStructNames { + names := prevNames() + names.controlNames[nvgpu.NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE] = simpleIoctl("NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE_PARAMS") + + return names + } + + return abi + } + _ = addDriverABI(550, 90, 07, "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733", v550_90_07) }) }