From d3f847b0adc848d3c030db9664ce37356a028c6c Mon Sep 17 00:00:00 2001 From: Tapasya Patki Date: Mon, 11 Mar 2024 15:17:14 -0700 Subject: [PATCH] Update JSON utilization API to support CPU-only, GPU-only and multi-architecture systems (#525) --- src/docs/sphinx/VariorumAPI.rst | 7 +- .../sphinx/api/json_support_functions.rst | 3 +- src/examples/CMakeLists.txt | 3 +- ...riorum-get-node-utilization-json-example.c | 88 ----------------- ...> variorum-get-utilization-json-example.c} | 4 +- src/variorum/AMD_GPU/amd_gpu_power_features.c | 3 +- src/variorum/AMD_GPU/config_amd_gpu.c | 2 +- src/variorum/Nvidia_GPU/config_nvidia.c | 2 +- src/variorum/config_architecture.c | 2 +- src/variorum/config_architecture.h | 4 +- src/variorum/variorum.c | 97 ++++++++----------- src/variorum/variorum.h | 40 +------- 12 files changed, 53 insertions(+), 202 deletions(-) delete mode 100644 src/examples/variorum-get-node-utilization-json-example.c rename src/examples/{variorum-get-gpu-utilization-json-example.c => variorum-get-utilization-json-example.c} (94%) diff --git a/src/docs/sphinx/VariorumAPI.rst b/src/docs/sphinx/VariorumAPI.rst index 92df62457..40c1664b8 100644 --- a/src/docs/sphinx/VariorumAPI.rst +++ b/src/docs/sphinx/VariorumAPI.rst @@ -122,11 +122,10 @@ The API to obtain node utilization has the following format. It takes a string (``char**``) by reference as input, and populates this string with a JSON object with total CPU, system CPU, user CPU, total memory, and GPU (when available) utilizations. It reports the utilization of each available GPU. GPU utilization -is accomplished using the ``int variorum_get_gpu_utilization_json(char -**get_gpu_util_obj_str)`` function. The total memory utilization is computed +is obtained using the NVML and RSMI APIs. The total memory utilization is computed using ``/proc/meminfo``, and CPU utilizations is computed using ``/proc/stat``. -The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function +The ``variorum_get_utilization_json(char **get_util_obj_str)`` function returns a string type nested JSON object. An example is provided below: .. code:: @@ -150,7 +149,7 @@ returns a string type nested JSON object. An example is provided below: The ``*`` here refers to socket ID, and the ``#`` refers to GPU ID. -The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function +The ``variorum_get_utilization_json(char **get_util_obj_str)`` function returns a string type nested JSON object. An example is provided below: .. code:: diff --git a/src/docs/sphinx/api/json_support_functions.rst b/src/docs/sphinx/api/json_support_functions.rst index 9ca48128a..f49348631 100644 --- a/src/docs/sphinx/api/json_support_functions.rst +++ b/src/docs/sphinx/api/json_support_functions.rst @@ -20,6 +20,5 @@ Defined in ``variorum/variorum.h``. .. doxygenfunction:: variorum_get_frequency_json -.. doxygenfunction:: variorum_get_node_utilization_json +.. doxygenfunction:: variorum_get_utilization_json -.. doxygenfunction:: variorum_get_gpu_utilization_json diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt index 2d1ceff9b..2cc9acb3d 100644 --- a/src/examples/CMakeLists.txt +++ b/src/examples/CMakeLists.txt @@ -17,11 +17,10 @@ set(BASIC_EXAMPLES variorum-disable-turbo-example variorum-enable-turbo-example variorum-get-frequency-json-example - variorum-get-gpu-utilization-json-example variorum-get-node-power-domain-info-json-example variorum-get-power-json-example variorum-get-thermals-json-example - variorum-get-node-utilization-json-example + variorum-get-utilization-json-example variorum-get-topology-info-example variorum-integration-using-json-example variorum-monitoring-to-file-example diff --git a/src/examples/variorum-get-node-utilization-json-example.c b/src/examples/variorum-get-node-utilization-json-example.c deleted file mode 100644 index 8802f3e44..000000000 --- a/src/examples/variorum-get-node-utilization-json-example.c +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other -// Variorum Project Developers. See the top-level LICENSE file for details. -// -// SPDX-License-Identifier: MIT - -#include -#include -#include - -#include -#include - -#ifdef SECOND_RUN -static inline double do_work(int input) -{ - int i; - double result = (double)input; - - for (i = 0; i < 100000; i++) - { - result += i * result; - } - - return result; -} -#endif - -int main(int argc, char **argv) -{ - int ret; - char *s = NULL; -#ifdef SECOND_RUN - int i; - int size = 1E4; - volatile double x = 0.0; -#endif - - const char *usage = "Usage: %s [-h] [-v]\n"; - int opt; - while ((opt = getopt(argc, argv, "hv")) != -1) - { - switch (opt) - { - case 'h': - printf(usage, argv[0]); - return 0; - case 'v': - printf("%s\n", variorum_get_current_version()); - return 0; - default: - fprintf(stderr, usage, argv[0]); - return -1; - } - } - ret = variorum_get_node_utilization_json(&s); - if (ret != 0) - { - printf("First run: JSON get node utilization failed!\n"); - free(s); - exit(-1); - } - - /* Print the entire JSON object */ - puts(s); - -#ifdef SECOND_RUN - for (i = 0; i < size; i++) - { - x += do_work(i); - } - printf("Final result: %f\n", x); - ret = variorum_get_node_utilization_json(&s); - if (ret != 0) - { - printf("Second run: JSON get node utilization failed!\n"); - free(s); - exit(-1); - } - - /* Print the entire JSON object */ - puts(s); -#endif - - /* Deallocate the string */ - free(s); - - return ret; -} diff --git a/src/examples/variorum-get-gpu-utilization-json-example.c b/src/examples/variorum-get-utilization-json-example.c similarity index 94% rename from src/examples/variorum-get-gpu-utilization-json-example.c rename to src/examples/variorum-get-utilization-json-example.c index 32748688f..cfd637004 100644 --- a/src/examples/variorum-get-gpu-utilization-json-example.c +++ b/src/examples/variorum-get-utilization-json-example.c @@ -52,7 +52,7 @@ int main(int argc, char **argv) return -1; } } - ret = variorum_get_gpu_utilization_json(&s); + ret = variorum_get_utilization_json(&s); if (ret != 0) { printf("First run: JSON get node utilization failed!\n"); @@ -69,7 +69,7 @@ int main(int argc, char **argv) x += do_work(i); } printf("Final result: %f\n", x); - ret = variorum_get_gpu_utilization_json(&s); + ret = variorum_get_utilization_json(&s); if (ret != 0) { printf("Second run: JSON get node utilization failed!\n"); diff --git a/src/variorum/AMD_GPU/amd_gpu_power_features.c b/src/variorum/AMD_GPU/amd_gpu_power_features.c index a23bd10f8..b0737d550 100644 --- a/src/variorum/AMD_GPU/amd_gpu_power_features.c +++ b/src/variorum/AMD_GPU/amd_gpu_power_features.c @@ -789,7 +789,6 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets, rsmi_status_t ret; uint32_t num_devices; int gpus_per_socket; - int d = 0; char socket_id[12]; char hostname[1024]; char device_id[12]; @@ -869,7 +868,7 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets, getenv("HOSTNAME"), __FILE__, __FUNCTION__, __LINE__); } - snprintf(device_id, 12, "GPU%d_util%%", d); + snprintf(device_id, 12, "GPU%d_util%%", i); json_object_set_new(socket_obj, device_id, json_integer(utilpercent)); } diff --git a/src/variorum/AMD_GPU/config_amd_gpu.c b/src/variorum/AMD_GPU/config_amd_gpu.c index c182f4183..a9f50b72f 100644 --- a/src/variorum/AMD_GPU/config_amd_gpu.c +++ b/src/variorum/AMD_GPU/config_amd_gpu.c @@ -35,7 +35,7 @@ int set_amd_gpu_func_ptrs(int idx) amd_gpu_instinct_get_gpu_utilization; g_platform[idx].variorum_get_thermals_json = amd_gpu_instinct_get_thermals_json; g_platform[idx].variorum_get_frequency_json = amd_gpu_instinct_get_clocks_json; - g_platform[idx].variorum_get_gpu_utilization_json = + g_platform[idx].variorum_get_utilization_json = amd_gpu_instinct_get_gpu_utilization_json; /* Initialize control interfaces */ g_platform[idx].variorum_cap_each_gpu_power_limit = diff --git a/src/variorum/Nvidia_GPU/config_nvidia.c b/src/variorum/Nvidia_GPU/config_nvidia.c index 63494feae..4e98a6cbb 100644 --- a/src/variorum/Nvidia_GPU/config_nvidia.c +++ b/src/variorum/Nvidia_GPU/config_nvidia.c @@ -32,7 +32,7 @@ int set_nvidia_func_ptrs(int idx) g_platform[idx].variorum_print_gpu_utilization = volta_get_gpu_utilization; g_platform[idx].variorum_get_thermals_json = volta_get_thermals_json; g_platform[idx].variorum_get_frequency_json = volta_get_clocks_json; - g_platform[idx].variorum_get_gpu_utilization_json = + g_platform[idx].variorum_get_utilization_json = volta_get_gpu_utilization_json; /* Initialize control interfaces */ g_platform[idx].variorum_cap_each_gpu_power_limit = diff --git a/src/variorum/config_architecture.c b/src/variorum/config_architecture.c index f4db9a6ed..2d5c07172 100644 --- a/src/variorum/config_architecture.c +++ b/src/variorum/config_architecture.c @@ -354,7 +354,7 @@ void variorum_init_func_ptrs() g_platform[i].variorum_print_turbo = NULL; g_platform[i].variorum_poll_power = NULL; g_platform[i].variorum_print_gpu_utilization = NULL; - g_platform[i].variorum_get_gpu_utilization_json = NULL; + g_platform[i].variorum_get_utilization_json = NULL; g_platform[i].variorum_monitoring = NULL; g_platform[i].variorum_get_power_json = NULL; g_platform[i].variorum_get_node_power_domain_info_json = NULL; diff --git a/src/variorum/config_architecture.h b/src/variorum/config_architecture.h index 0b89fb0ea..406c53161 100644 --- a/src/variorum/config_architecture.h +++ b/src/variorum/config_architecture.h @@ -237,10 +237,10 @@ struct platform /// @return Error code. int (*variorum_print_gpu_utilization)(int long_ver); - /// @brief Function pointer to get JSON object for GPU utilization + /// @brief Function pointer to get JSON object for utilization /// /// @return Error code. - int (*variorum_get_gpu_utilization_json)(char **get_gpu_util_obj_str); + int (*variorum_get_utilization_json)(char **get_util_obj_str); /// @brief Function pointer to get JSON object for node power data. /// diff --git a/src/variorum/variorum.c b/src/variorum/variorum.c index 4f788ab21..d19025eb4 100644 --- a/src/variorum/variorum.c +++ b/src/variorum/variorum.c @@ -1090,7 +1090,7 @@ int variorum_get_power_json(char **get_power_obj_str) return err; } -int variorum_get_node_utilization_json(char **get_util_obj_str) +int variorum_get_utilization_json(char **get_util_obj_str) { int err = 0; err = variorum_enter(__FILE__, __FUNCTION__, __LINE__); @@ -1099,20 +1099,12 @@ int variorum_get_node_utilization_json(char **get_util_obj_str) return -1; } - err = variorum_exit(__FILE__, __FUNCTION__, __LINE__); - if (err) - { - return -1; - } - char hostname[1024]; struct timeval tv; uint64_t ts; - char *gpu_util_str = NULL; gethostname(hostname, 1024); gettimeofday(&tv, NULL); ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; - int ret; char str[100]; const char d[2] = " "; char *token, *s, *p; @@ -1136,9 +1128,36 @@ int variorum_get_node_utilization_json(char **get_util_obj_str) uint64_t mem_free = 0; uint64_t sys_time = 0; int strcp; + int idx = -1; + json_t *get_util_obj = NULL; + json_t *get_cpu_util_obj = NULL; + json_t *get_timestamp_obj = NULL; + json_t *cpu_util_obj = NULL; + + // Look for a GPU build and get an ID. + for (idx = 0; idx < P_NUM_PLATFORMS; idx++) + { +#ifdef VARIORUM_WITH_INTEL_GPU + idx = P_INTEL_GPU_IDX; + break; +#endif +#ifdef VARIORUM_WITH_NVIDIA_GPU + idx = P_NVIDIA_GPU_IDX; + break; +#endif +#ifdef VARIORUM_WITH_AMD_GPU + idx = P_AMD_GPU_IDX; + break; +#endif + } + + // If we have a GPU build, obtain the GPU object first. +#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU) + int ret; + char *gpu_util_str = NULL; // get gpu utilization - ret = variorum_get_gpu_utilization_json(&gpu_util_str); + ret = g_platform[idx].variorum_get_utilization_json(&gpu_util_str); if (ret != 0) { printf("JSON get gpu utilization failed. Exiting.\n"); @@ -1146,23 +1165,26 @@ int variorum_get_node_utilization_json(char **get_util_obj_str) return -1; } - /* Load the string as a JSON object using Jansson */ - json_t *get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL); + /* Load the existing GPU string as a JSON object using Jansson */ + get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL); + get_cpu_util_obj = json_object_get(get_util_obj, hostname); + get_timestamp_obj = json_object_get(get_cpu_util_obj, "timestamp"); + cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU"); +#endif - json_t *get_cpu_util_obj = json_object_get(get_util_obj, hostname); - if (get_cpu_util_obj == NULL) + //CPU-only build will have this object as NULL. + if (get_util_obj == NULL) { + get_util_obj = json_object(); get_cpu_util_obj = json_object(); json_object_set_new(get_util_obj, hostname, get_cpu_util_obj); } - json_t *get_timestamp_obj = json_object_get(get_util_obj, "timestamp"); if (get_timestamp_obj == NULL) { json_object_set_new(get_cpu_util_obj, "timestamp", json_integer(ts)); } - json_t *cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU"); if (cpu_util_obj == NULL) { cpu_util_obj = json_object(); @@ -1237,6 +1259,7 @@ int variorum_get_node_utilization_json(char **get_util_obj_str) last_sum = sum; last_sys_time = sys_time; last_idle = sum_idle; + json_object_set_new(cpu_util_obj, "total_util%", json_real(cpu_util)); json_object_set_new(cpu_util_obj, "user_util%", json_real(user_util)); json_object_set_new(cpu_util_obj, "system_util%", json_real(sys_util)); @@ -1293,48 +1316,6 @@ int variorum_get_node_utilization_json(char **get_util_obj_str) *get_util_obj_str = json_dumps(get_util_obj, JSON_INDENT(4)); json_decref(get_util_obj); state = 1; - return 0; -} - -int variorum_get_gpu_utilization_json(char **get_gpu_util_obj_str) -{ - int err = 0; - int i; - err = variorum_enter(__FILE__, __FUNCTION__, __LINE__); - if (err) - { - return -1; - } - - for (i = 0; i < P_NUM_PLATFORMS; i++) - { -#ifdef VARIORUM_WITH_INTEL_GPU - i = P_INTEL_GPU_IDX; - break; -#endif -#ifdef VARIORUM_WITH_NVIDIA_GPU - i = P_NVIDIA_GPU_IDX; - break; -#endif -#ifdef VARIORUM_WITH_AMD_GPU - i = P_AMD_GPU_IDX; - break; -#endif - } - - if (g_platform[i].variorum_get_gpu_utilization_json == NULL) - { - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, - getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return -1; - } - err = g_platform[i].variorum_get_gpu_utilization_json(get_gpu_util_obj_str); - if (err) - { - return -1; - } err = variorum_exit(__FILE__, __FUNCTION__, __LINE__); if (err) diff --git a/src/variorum/variorum.h b/src/variorum/variorum.h index 752c0d47c..378468ea6 100644 --- a/src/variorum/variorum.h +++ b/src/variorum/variorum.h @@ -584,45 +584,7 @@ int variorum_get_power_json(char **get_power_obj_str); /// @returns 0 if successful, otherwise -1. Note that feature not implemented /// returns a -1 for the JSON APIs so that users don't have to explicitly /// check for NULL strings. -int variorum_get_node_utilization_json(char **get_util_obj_str); - -/// @brief Populate a string in JSON format with utilization of each GPU -/// -/// Format: -/// { -/// "hostname": { -/// "GPU": { -/// Socket_n : { -/// GPUnm_util% : GPU_utilization -/// }, -/// "timestamp" : timestamp -/// } -/// where n is the socket number and m is the GPU ID. -/// -/// @supparch -/// - AMD Radeon Instinct GPUs (MI50 onwards) -/// - NVIDIA Volta -/// -/// @unsupported -/// - AMD EPYC Milan -/// - IBM Power9 -/// - Intel Sandy Bridge -/// - Intel Ivy Bridge -/// - Intel Haswell -/// - Intel Broadwell -/// - Intel Skylake -/// - Intel Kaby Lake -/// - Intel Ice Lake -/// - Intel Cascade Lake -/// - Intel Cooper Lake -/// -/// @param [out] get_gpu_util_obj_str String (passed by reference) that contains -/// node-level utilization information. -/// -/// @returns 0 if successful, otherwise -1. Note that feature not implemented -/// returns a -1 for the JSON APIs so that users don't have to explicitly -/// check for NULL strings. -int variorum_get_gpu_utilization_json(char **get_gpu_util_obj_str); +int variorum_get_utilization_json(char **get_util_obj_str); /// @brief Populate a string in JSON format with measurable and controllable /// power domains, along with the ranges.