diff --git a/src/docs/sphinx/VariorumAPI.rst b/src/docs/sphinx/VariorumAPI.rst index 98027b97d..15552e890 100644 --- a/src/docs/sphinx/VariorumAPI.rst +++ b/src/docs/sphinx/VariorumAPI.rst @@ -26,6 +26,7 @@ implementations in Variorum are described in the following sections: - :doc:`api/cap_functions` - :doc:`api/json_support_functions` - :doc:`api/enable_disable_functions` +- :doc:`api/json` ******************* Variorum Wrappers diff --git a/src/docs/sphinx/api/json.rst b/src/docs/sphinx/api/json.rst new file mode 100644 index 000000000..0a13ab6a0 --- /dev/null +++ b/src/docs/sphinx/api/json.rst @@ -0,0 +1,48 @@ +.. # Copyright 2019-2023 Lawrence Livermore National Security, LLC and other + # Variorum Project Developers. See the top-level LICENSE file for details. + # + # SPDX-License-Identifier: MIT + +########## + JSON API +########## + +******************************* + Obtaining Thermal Information +******************************* + +The API to obtain node thermal information has the following format. It takes a string +(``char**``) by reference as input, and populates this string with a nested +JSON object with hostname, followed by socket_{number}, followed by CPU and/or +GPU (depending on the platform, may contain only one or both), followed by Core +and Mem for CPU. + +The ``variorum_get_thermals_json(char **)`` function returns a string type +nested JSON object. An example is provided below:: + + { + "hostname": { + "socket_0": { + "CPU": { + "Core": { + "temp_celsius_core_0": (Integer), + ... + "temp_celsius_core_i": (Integer), + }, + "Mem": { + "temp_celsius_dimm_0": (Integer), + ... + "temp_celsius_dimm_i": (Integer), + }, + }, + "GPU": { + "temp_celsius_gpu_0": (Integer), + ... + "temp_celsius_gpu_i": (Integer), + } + }, + "timestamp" : (Integer) + } + } + +Here, ``i`` is the index of the core or GPU and ``0 <= i < num_cores/GPUs``. 
diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt index 3ef2b1df4..bcb464b3b 100644 --- a/src/examples/CMakeLists.txt +++ b/src/examples/CMakeLists.txt @@ -18,6 +18,7 @@ set(BASIC_EXAMPLES variorum-enable-turbo-example variorum-get-node-power-json-example variorum-get-node-power-domain-info-json-example + variorum-get-node-thermal-json-example variorum-integration-using-json-example variorum-get-topology-info-example variorum-monitoring-to-file-example diff --git a/src/examples/variorum-get-node-thermal-json-example.c b/src/examples/variorum-get-node-thermal-json-example.c new file mode 100644 index 000000000..5d0895dc2 --- /dev/null +++ b/src/examples/variorum-get-node-thermal-json-example.c @@ -0,0 +1,51 @@ +// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other +// Variorum Project Developers. See the top-level LICENSE file for details. +// +// SPDX-License-Identifier: MIT + +#include +#include +#include + +#include +#include + +int main(int argc, char **argv) +{ + int ret; + char *s = NULL; + + const char *usage = "Usage: %s [-h] [-v]\n"; + int opt; + while ((opt = getopt(argc, argv, "hv")) != -1) + { + switch (opt) + { + case 'h': + printf(usage, argv[0]); + return 0; + case 'v': + printf("%s\n", variorum_get_current_version()); + return 0; + default: + fprintf(stderr, usage, argv[0]); + return -1; + } + } + + ret = variorum_get_thermals_json(&s); + if (ret != 0) + { + printf("First run: JSON get thermals failed!\n"); + free(s); + exit(-1); + } + + /* Print the entire JSON object */ + puts(s); + + /* Deallocate the string */ + free(s); + + return ret; +} diff --git a/src/variorum/AMD_GPU/amd_gpu_power_features.c b/src/variorum/AMD_GPU/amd_gpu_power_features.c index 689178239..549378fb3 100644 --- a/src/variorum/AMD_GPU/amd_gpu_power_features.c +++ b/src/variorum/AMD_GPU/amd_gpu_power_features.c @@ -273,6 +273,7 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output) static int init = 0; 
static struct timeval start; struct timeval now; + int i; gethostname(hostname, 1024); @@ -316,8 +317,7 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output) gettimeofday(&now, NULL); - for (int i = chipid * gpus_per_socket; - i < (chipid + 1) * gpus_per_socket; i++) + for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++) { int64_t temp_val = -1; double temp_val_flt = -1.0; @@ -379,6 +379,86 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output) } } +void get_thermals_json(int chipid, int total_sockets, json_t *output) +{ + rsmi_status_t ret; + uint32_t num_devices; + int gpus_per_socket; + char hostname[1024]; + + gethostname(hostname, 1024); + + ret = rsmi_init(0); + if (ret != RSMI_STATUS_SUCCESS) + { + variorum_error_handler("Could not initialize RSMI", + VARIORUM_ERROR_PLATFORM_ENV, + getenv("HOSTNAME"), __FILE__, __FUNCTION__, + __LINE__); + exit(-1); + } + + ret = rsmi_num_monitor_devices(&num_devices); + if (ret != RSMI_STATUS_SUCCESS) + { + variorum_error_handler("Could not get number of GPU devices", + VARIORUM_ERROR_PLATFORM_ENV, + getenv("HOSTNAME"), __FILE__, __FUNCTION__, + __LINE__); + } + + gpus_per_socket = num_devices / total_sockets; + + char socketid[12]; + snprintf(socketid, 12, "socket_%d", chipid); + + // check if socket object is in node object + json_t *socket_obj = json_object_get(output, socketid); + if (socket_obj == NULL) + { + socket_obj = json_object(); + json_object_set_new(output, socketid, socket_obj); + } + + // general gpu object + json_t *gpu_obj = json_object(); + json_object_set_new(socket_obj, "GPU", gpu_obj); + + int i; + for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++) + { + int64_t temp_val = -1; + double temp_val_flt = -1.0; + + ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT, + &temp_val); + if (ret != RSMI_STATUS_SUCCESS) + { + variorum_error_handler("RSMI API was not successful", + 
VARIORUM_ERROR_PLATFORM_ENV, + getenv("HOSTNAME"), __FILE__, __FUNCTION__, + __LINE__); + } + + temp_val_flt = (double)(temp_val / (1000)); // Convert to Celcius. + + // gpu entry + char gpuid[32]; + snprintf(gpuid, 32, "temp_celsius_gpu_%d", i); + json_object_set_new(gpu_obj, gpuid, json_real(temp_val_flt)); + } + + ret = rsmi_shut_down(); + + if (ret != RSMI_STATUS_SUCCESS) + { + variorum_error_handler("Could not shutdown RSMI", + VARIORUM_ERROR_PLATFORM_ENV, + getenv("HOSTNAME"), __FILE__, __FUNCTION__, + __LINE__); + } +} + void get_clocks_data(int chipid, int total_sockets, int verbose, FILE *output) { rsmi_status_t ret; diff --git a/src/variorum/AMD_GPU/amd_gpu_power_features.h b/src/variorum/AMD_GPU/amd_gpu_power_features.h index 021d3d963..a6deef90c 100644 --- a/src/variorum/AMD_GPU/amd_gpu_power_features.h +++ b/src/variorum/AMD_GPU/amd_gpu_power_features.h @@ -8,6 +8,7 @@ #include #include +#include #include @@ -22,4 +23,6 @@ void get_gpu_utilization_data(int chipid, int total_sockets, int verbose, void cap_each_gpu_power_limit(int chipid, int total_sockets, unsigned int powerlimit); +void get_thermals_json(int chipid, int total_sockets, json_t *output); + #endif diff --git a/src/variorum/AMD_GPU/config_amd_gpu.c b/src/variorum/AMD_GPU/config_amd_gpu.c index c33aaa96d..98a4032f4 100644 --- a/src/variorum/AMD_GPU/config_amd_gpu.c +++ b/src/variorum/AMD_GPU/config_amd_gpu.c @@ -37,6 +37,8 @@ int set_amd_gpu_func_ptrs(int idx) /* Initialize control interfaces */ g_platform[idx].variorum_cap_each_gpu_power_limit = amd_gpu_instinct_cap_each_gpu_power_limit; + g_platform[idx].variorum_get_thermals_json = + amd_gpu_instinct_get_thermals_json; } else { diff --git a/src/variorum/AMD_GPU/instinctGPU.c b/src/variorum/AMD_GPU/instinctGPU.c index 64bb835c0..403f3fa43 100644 --- a/src/variorum/AMD_GPU/instinctGPU.c +++ b/src/variorum/AMD_GPU/instinctGPU.c @@ -77,6 +77,27 @@ int amd_gpu_instinct_get_thermals(int verbose) return 0; } +int 
amd_gpu_instinct_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + unsigned iter = 0; + unsigned nsockets; + + variorum_get_topology(&nsockets, NULL, NULL, P_AMD_GPU_IDX); + + for (iter = 0; iter < nsockets; iter++) + { + get_thermals_json(iter, nsockets, get_thermal_obj); + } + + return 0; +} + int amd_gpu_instinct_get_clocks(int verbose) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/AMD_GPU/instinctGPU.h b/src/variorum/AMD_GPU/instinctGPU.h index ecd1ed667..dbdb0adc3 100644 --- a/src/variorum/AMD_GPU/instinctGPU.h +++ b/src/variorum/AMD_GPU/instinctGPU.h @@ -6,11 +6,15 @@ #ifndef INSTINCTGPU_H_INCLUDE #define INSTINCTGPU_H_INCLUDE +#include +#include + int amd_gpu_instinct_get_power(int verbose); int amd_gpu_instinct_get_power_limit(int verbose); int amd_gpu_instinct_get_thermals(int verbose); int amd_gpu_instinct_get_clocks(int verbose); int amd_gpu_instinct_get_gpu_utilization(int verbose); int amd_gpu_instinct_cap_each_gpu_power_limit(unsigned int powerlimit); +int amd_gpu_instinct_get_thermals_json(json_t *get_thermal_obj); #endif diff --git a/src/variorum/IBM/Power9.c b/src/variorum/IBM/Power9.c index 77e64bfaf..a2528f874 100644 --- a/src/variorum/IBM/Power9.c +++ b/src/variorum/IBM/Power9.c @@ -471,6 +471,68 @@ int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str) return 0; } +int ibm_cpu_p9_get_node_thermal_json(json_t *get_thermal_obj) +{ + char *val = ("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + void *buf; + int fd; + int rc; + int bytes; + unsigned iter = 0; + unsigned nsockets; + +#ifdef VARIORUM_WITH_IBM_CPU + variorum_get_topology(&nsockets, NULL, NULL, P_IBM_CPU_IDX); +#endif + + fd = open("/sys/firmware/opal/exports/occ_inband_sensors", O_RDONLY); + if (fd < 0) + { + printf("Failed to open occ_inband_sensors file\n"); + return -1; + } + + for 
(iter = 0; iter < nsockets; iter++) + { + lseek(fd, iter * OCC_SENSOR_DATA_BLOCK_SIZE, SEEK_SET); + + buf = malloc(OCC_SENSOR_DATA_BLOCK_SIZE); + if (!buf) + { + printf("Failed to allocate\n"); + return -1; + } + + for (rc = bytes = 0; bytes < OCC_SENSOR_DATA_BLOCK_SIZE; bytes += rc) + { + rc = read(fd, buf + bytes, OCC_SENSOR_DATA_BLOCK_SIZE - bytes); + + if (!rc || rc < 0) + { + break; + } + } + + if (bytes != OCC_SENSOR_DATA_BLOCK_SIZE) + { + printf("Failed to read data\n"); + free(buf); + return -1; + } + + json_get_thermal_sensors(iter, get_thermal_obj, buf); + free(buf); + } + + close(fd); + return 0; +} + int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str) { char *val = ("VARIORUM_LOG"); diff --git a/src/variorum/IBM/Power9.h b/src/variorum/IBM/Power9.h index 2d7fab4bd..028db0296 100644 --- a/src/variorum/IBM/Power9.h +++ b/src/variorum/IBM/Power9.h @@ -24,4 +24,6 @@ int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str); int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str); +int ibm_cpu_p9_get_node_thermal_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/IBM/config_ibm.c b/src/variorum/IBM/config_ibm.c index 89eecfd8f..b901c1e08 100644 --- a/src/variorum/IBM/config_ibm.c +++ b/src/variorum/IBM/config_ibm.c @@ -36,6 +36,8 @@ int set_ibm_func_ptrs(int idx) g_platform[idx].variorum_get_node_power_json = ibm_cpu_p9_get_node_power_json; g_platform[idx].variorum_get_node_power_domain_info_json = ibm_cpu_p9_get_node_power_domain_info_json; + g_platform[idx].variorum_get_thermals_json = + ibm_cpu_p9_get_node_thermal_json; } else { diff --git a/src/variorum/IBM/ibm_power_features.c b/src/variorum/IBM/ibm_power_features.c index 6f1bb4358..6cef1255d 100644 --- a/src/variorum/IBM/ibm_power_features.c +++ b/src/variorum/IBM/ibm_power_features.c @@ -455,3 +455,65 @@ void json_get_power_sensors(int chipid, json_t *get_power_obj, const void *buf) json_object_set_new(get_power_obj, mem_str, 
json_real(pwrmem)); json_object_set_new(get_power_obj, gpu_str, json_real(pwrgpu)); } + +void json_get_thermal_sensors(int chipid, json_t *node_obj, const void *buf) +{ + struct occ_sensor_data_header *hb; + struct occ_sensor_name *md; + int i; + + hb = (struct occ_sensor_data_header *)(uint64_t)buf; + md = (struct occ_sensor_name *)((uint64_t)hb + be32toh(hb->names_offset)); + + char socketid[12]; + snprintf(socketid, 12, "socket_%d", chipid); + + json_t *socket_obj = json_object_get(node_obj, socketid); + if (socket_obj == NULL) + { + socket_obj = json_object(); + json_object_set_new(node_obj, socketid, socket_obj); + } + + json_t *cpu_obj = json_object(); + json_object_set_new(socket_obj, "CPU", cpu_obj); + + json_t *core_obj = json_object(); + json_object_set_new(cpu_obj, "Core", core_obj); + + json_t *mem_obj = json_object(); + json_object_set_new(cpu_obj, "Mem", mem_obj); + + for (i = 0; i < be16toh(hb->nr_sensors); i++) + { + uint32_t offset = be32toh(md[i].reading_offset); + uint32_t scale = be32toh(md[i].scale_factor); + uint64_t sample = 0; + + if (md[i].structure_type == OCC_SENSOR_READING_FULL) + { + sample = read_sensor(hb, offset, SENSOR_SAMPLE); + } + + if (strncmp(md[i].name, "TEMPPROCTHRMC", 13) == 0) + { + char core_temp[32]; + char core_temp_value[5]; + strncpy(core_temp_value, md[i].name + 13, 5); + int core_temp_int = atoi(core_temp_value); + snprintf(core_temp, 32, "temp_celsius_core_%d", core_temp_int); + + json_object_set_new(core_obj, core_temp, json_integer(sample * TO_FP(scale))); + } + else if (strncmp(md[i].name, "TEMPDIMM", 8) == 0) + { + char mem_temp[32]; + char mem_temp_value[5]; + strncpy(mem_temp_value, md[i].name + 8, 5); + int mem_temp_int = atoi(mem_temp_value); + snprintf(mem_temp, 32, "temp_celsius_dimm_%d", mem_temp_int); + + json_object_set_new(mem_obj, mem_temp, json_integer(sample * TO_FP(scale))); + } + } +} diff --git a/src/variorum/IBM/ibm_power_features.h b/src/variorum/IBM/ibm_power_features.h index 
21ad28e26..005ff723d 100644 --- a/src/variorum/IBM/ibm_power_features.h +++ b/src/variorum/IBM/ibm_power_features.h @@ -153,4 +153,8 @@ void json_get_power_sensors(int chipid, json_t *get_power_obj, const void *buf); +void json_get_thermal_sensors(int chipid, + json_t *get_thermal_obj, + const void *buf); + #endif diff --git a/src/variorum/Intel/Intel_06_2A.c b/src/variorum/Intel/Intel_06_2A.c index 91301e7ab..e060f080a 100644 --- a/src/variorum/Intel/Intel_06_2A.c +++ b/src/variorum/Intel/Intel_06_2A.c @@ -451,6 +451,22 @@ int intel_cpu_fm_06_2a_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_2a_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_2a_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_2A.h b/src/variorum/Intel/Intel_06_2A.h index b3df3e1c1..f20d1bebf 100644 --- a/src/variorum/Intel/Intel_06_2A.h +++ b/src/variorum/Intel/Intel_06_2A.h @@ -111,4 +111,6 @@ int intel_cpu_fm_06_2a_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_2a_get_frequencies(void); +int intel_cpu_fm_06_2a_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_2D.c b/src/variorum/Intel/Intel_06_2D.c index d4368702f..19168e78b 100644 --- a/src/variorum/Intel/Intel_06_2D.c +++ b/src/variorum/Intel/Intel_06_2D.c @@ -454,6 +454,22 @@ int intel_cpu_fm_06_2d_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_2d_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + 
get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_2d_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_2D.h b/src/variorum/Intel/Intel_06_2D.h index d36d69e9d..f19a7f055 100644 --- a/src/variorum/Intel/Intel_06_2D.h +++ b/src/variorum/Intel/Intel_06_2D.h @@ -113,4 +113,6 @@ int intel_cpu_fm_06_2d_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_2d_get_frequencies(void); +int intel_cpu_fm_06_2d_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_3E.c b/src/variorum/Intel/Intel_06_3E.c index 5a5dbc3bb..468a92163 100644 --- a/src/variorum/Intel/Intel_06_3E.c +++ b/src/variorum/Intel/Intel_06_3E.c @@ -479,6 +479,22 @@ int intel_cpu_fm_06_3e_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_3e_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_3e_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_3E.h b/src/variorum/Intel/Intel_06_3E.h index e6f6499c6..fc11c99a1 100644 --- a/src/variorum/Intel/Intel_06_3E.h +++ b/src/variorum/Intel/Intel_06_3E.h @@ -114,4 +114,6 @@ int intel_cpu_fm_06_3e_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_3e_get_frequencies(void); +int intel_cpu_fm_06_3e_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_3F.c b/src/variorum/Intel/Intel_06_3F.c index c7a3df18a..acc35b1af 100644 --- a/src/variorum/Intel/Intel_06_3F.c 
+++ b/src/variorum/Intel/Intel_06_3F.c @@ -473,6 +473,22 @@ int intel_cpu_fm_06_3f_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_3f_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_3f_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_3F.h b/src/variorum/Intel/Intel_06_3F.h index 149e31eea..dd1e9eee4 100644 --- a/src/variorum/Intel/Intel_06_3F.h +++ b/src/variorum/Intel/Intel_06_3F.h @@ -116,4 +116,6 @@ int intel_cpu_fm_06_3f_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_3f_get_frequencies(void); +int intel_cpu_fm_06_3f_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_4F.c b/src/variorum/Intel/Intel_06_4F.c index b346e9bb6..887d5e12d 100644 --- a/src/variorum/Intel/Intel_06_4F.c +++ b/src/variorum/Intel/Intel_06_4F.c @@ -496,6 +496,22 @@ int intel_cpu_fm_06_4f_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_4f_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_4f_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_4F.h b/src/variorum/Intel/Intel_06_4F.h index edf85907d..c05d3fc44 100644 --- a/src/variorum/Intel/Intel_06_4F.h +++ b/src/variorum/Intel/Intel_06_4F.h @@ -116,4 +116,6 @@ int 
intel_cpu_fm_06_4f_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_4f_get_frequencies(void); +int intel_cpu_fm_06_4f_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_55.c b/src/variorum/Intel/Intel_06_55.c index c9b0e79c2..b9dce0e8d 100644 --- a/src/variorum/Intel/Intel_06_55.c +++ b/src/variorum/Intel/Intel_06_55.c @@ -414,6 +414,22 @@ int intel_cpu_fm_06_55_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_55_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int intel_cpu_fm_06_55_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_55.h b/src/variorum/Intel/Intel_06_55.h index bc591ced4..5c1f4cb1c 100644 --- a/src/variorum/Intel/Intel_06_55.h +++ b/src/variorum/Intel/Intel_06_55.h @@ -112,4 +112,6 @@ int intel_cpu_fm_06_55_cap_frequency(int core_freq_mhz); int intel_cpu_fm_06_55_get_frequencies(void); +int intel_cpu_fm_06_55_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/Intel_06_9E.c b/src/variorum/Intel/Intel_06_9E.c index af2473bbd..5734405a7 100644 --- a/src/variorum/Intel/Intel_06_9E.c +++ b/src/variorum/Intel/Intel_06_9E.c @@ -416,6 +416,22 @@ int intel_cpu_fm_06_9e_get_node_power_domain_info_json(char return 0; } +int intel_cpu_fm_06_9e_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + get_therm_temp_reading_json(get_thermal_obj, + msrs.ia32_therm_status, + msrs.ia32_package_therm_status, + msrs.msr_temperature_target); + + return 0; +} + int 
intel_cpu_fm_06_9e_cap_best_effort_node_power_limit(int node_limit) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Intel/Intel_06_9E.h b/src/variorum/Intel/Intel_06_9E.h index 56d0a29ae..228dadafb 100644 --- a/src/variorum/Intel/Intel_06_9E.h +++ b/src/variorum/Intel/Intel_06_9E.h @@ -110,4 +110,6 @@ int intel_cpu_fm_06_9e_cap_best_effort_node_power_limit(int node_power_limit); int intel_cpu_fm_06_9e_get_frequencies(void); +int intel_cpu_fm_06_9e_get_thermals_json(json_t *get_thermal_obj); + #endif diff --git a/src/variorum/Intel/config_intel.c b/src/variorum/Intel/config_intel.c index d41a0235e..be491b64f 100644 --- a/src/variorum/Intel/config_intel.c +++ b/src/variorum/Intel/config_intel.c @@ -95,6 +95,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_2a_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_2a_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_2a_get_thermals_json; } else if (*g_platform[idx].arch_id == FM_06_2D) { @@ -122,6 +124,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_2d_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_2d_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_2d_get_thermals_json; } // Ivy Bridge 06_3E else if (*g_platform[idx].arch_id == FM_06_3E) @@ -150,6 +154,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_3e_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_3e_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_3e_get_thermals_json; } // Haswell 06_3F else if (*g_platform[idx].arch_id == FM_06_3F) @@ -178,6 +184,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_3f_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_3f_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + 
intel_cpu_fm_06_3f_get_thermals_json; } // Broadwell 06_4F else if (*g_platform[idx].arch_id == FM_06_4F) @@ -206,6 +214,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_4f_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_4f_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_4f_get_thermals_json; } // Skylake 06_55 else if (*g_platform[idx].arch_id == FM_06_55) @@ -234,6 +244,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_55_cap_frequency; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_55_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_55_get_thermals_json; } // Kaby Lake 06_9E else if (*g_platform[idx].arch_id == FM_06_9E) @@ -260,6 +272,8 @@ int set_intel_func_ptrs(int idx) intel_cpu_fm_06_9e_cap_best_effort_node_power_limit; g_platform[idx].variorum_print_available_frequencies = intel_cpu_fm_06_9e_get_frequencies; + g_platform[idx].variorum_get_thermals_json = + intel_cpu_fm_06_9e_get_thermals_json; } // Ice Lake 06_6A else if (*g_platform[idx].arch_id == FM_06_6A) diff --git a/src/variorum/Intel/thermal_features.c b/src/variorum/Intel/thermal_features.c index 192c5acaf..0ec92bc7b 100644 --- a/src/variorum/Intel/thermal_features.c +++ b/src/variorum/Intel/thermal_features.c @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #include #include @@ -377,6 +380,76 @@ int print_therm_temp_reading(FILE *writedest, off_t msr_therm_stat, return 0; } +int get_therm_temp_reading_json(json_t *get_thermal_object, + off_t msr_therm_stat, + off_t msr_pkg_therm_stat, + off_t msr_temp_target) +{ + struct therm_stat *t_stat = NULL; + struct msr_temp_target *t_target = NULL; + struct pkg_therm_stat *pkg_stat = NULL; + unsigned i, j, k; + unsigned nsockets, ncores, nthreads; + unsigned idx; + float core_temp; + int pkg_temp; + + variorum_get_topology(&nsockets, &ncores, &nthreads, P_INTEL_CPU_IDX); + + 
pkg_stat = (struct pkg_therm_stat *) malloc(nsockets * sizeof( + struct pkg_therm_stat)); + get_pkg_therm_stat(pkg_stat, msr_pkg_therm_stat); + + t_target = (struct msr_temp_target *) malloc(nsockets * sizeof( + struct msr_temp_target)); + get_temp_target(t_target, msr_temp_target); + + t_stat = (struct therm_stat *) malloc(nthreads * sizeof(struct therm_stat)); + get_therm_stat(t_stat, msr_therm_stat); + + for (i = 0; i < nsockets; i++) + { + char socket_id[12]; //up to 9999 sockets + snprintf(socket_id, 12, "socket_%d", i); + json_t *socket_obj = json_object_get(get_thermal_object, socket_id); + if (socket_obj == NULL) + { + socket_obj = json_object(); + json_object_set_new(get_thermal_object, socket_id, socket_obj); + } + + json_t *cpu_obj = json_object(); + json_object_set_new(socket_obj, "CPU", cpu_obj); + + pkg_temp = (int)t_target[i].temp_target - pkg_stat[i].readout; + json_object_set_new(cpu_obj, "PKG_Actual", json_integer(pkg_temp)); + + json_t *core_obj = json_object(); + json_object_set_new(cpu_obj, "Core", core_obj); + + for (j = 0; j < ncores / nsockets; j++) + { + char core[32]; + snprintf(core, 32, "temp_celsius_core_%d", j); + core_temp = 0.0; + + for (k = 0; k < nthreads / ncores; k++) + { + idx = (k * nsockets * (ncores / nsockets)) + (i * (ncores / nsockets)) + j; + core_temp += (int)t_target[i].temp_target - t_stat[idx].readout; + } + core_temp /= (nthreads / ncores); + json_object_set_new(core_obj, core, json_real(core_temp)); + } + } + + free(pkg_stat); + free(t_stat); + free(t_target); + + return 0; +} + ///// @brief Initialize storage for IA32_THERM_INTERRUPT. ///// ///// @param [out] ti Data for per-core thermal interrupts. 
diff --git a/src/variorum/Intel/thermal_features.h b/src/variorum/Intel/thermal_features.h index ae781187c..1853fb1ce 100644 --- a/src/variorum/Intel/thermal_features.h +++ b/src/variorum/Intel/thermal_features.h @@ -6,6 +6,8 @@ #ifndef THERMAL_FEATURES_H_INCLUDE #define THERMAL_FEATURES_H_INCLUDE +#include + /// @brief Structure containing data from MSR_TEMPERATURE_TARGET. /// /// The scope of this MSR is defined as unique for Sandy Bridge. In our @@ -282,6 +284,12 @@ int print_verbose_therm_temp_reading(FILE *writedest, off_t msr_pkg_therm_stat, off_t msr_temp_target); +/// @brief set json object string with temperature data +int get_therm_temp_reading_json(json_t *get_thermal_object, + off_t msr_therm_stat, + off_t msr_pkg_therm_stat, + off_t msr_temp_target); + /// @brief Read value of the IA32_PACKAGE_THERM_STATUS register and translate /// bit fields to human-readable values. /// diff --git a/src/variorum/Nvidia_GPU/Volta.c b/src/variorum/Nvidia_GPU/Volta.c index 8b4e73c72..dbfa352e8 100644 --- a/src/variorum/Nvidia_GPU/Volta.c +++ b/src/variorum/Nvidia_GPU/Volta.c @@ -51,6 +51,26 @@ int volta_get_thermals(int long_ver) return 0; } +int volta_get_thermals_json(json_t *get_thermal_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + unsigned iter = 0; + unsigned nsockets; + variorum_get_topology(&nsockets, NULL, NULL, P_NVIDIA_GPU_IDX); + + for (iter = 0; iter < nsockets; iter++) + { + nvidia_gpu_get_thermal_json(iter, get_thermal_obj); + } + + return 0; +} + int volta_get_clocks(int long_ver) { char *val = getenv("VARIORUM_LOG"); diff --git a/src/variorum/Nvidia_GPU/Volta.h b/src/variorum/Nvidia_GPU/Volta.h index a6402e006..8d47677d9 100644 --- a/src/variorum/Nvidia_GPU/Volta.h +++ b/src/variorum/Nvidia_GPU/Volta.h @@ -6,6 +6,8 @@ #ifndef VOLTA_H_INCLUDE #define VOLTA_H_INCLUDE +#include + int volta_get_power(int long_ver); int volta_get_thermals(int long_ver); @@ -18,4 +20,6 @@ 
int volta_get_gpu_utilization(int long_ver);
 
 int volta_cap_each_gpu_power_limit(unsigned int powerlimit);
 
+int volta_get_thermals_json(json_t *get_thermal_obj);
+
 #endif
diff --git a/src/variorum/Nvidia_GPU/config_nvidia.c b/src/variorum/Nvidia_GPU/config_nvidia.c
index d132e1c8e..693058e96 100644
--- a/src/variorum/Nvidia_GPU/config_nvidia.c
+++ b/src/variorum/Nvidia_GPU/config_nvidia.c
@@ -33,6 +33,8 @@ int set_nvidia_func_ptrs(int idx)
         /* Initialize control interfaces */
         g_platform[idx].variorum_cap_each_gpu_power_limit =
             volta_cap_each_gpu_power_limit;
+        g_platform[idx].variorum_get_thermals_json =
+            volta_get_thermals_json;
     }
     else
     {
diff --git a/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.c b/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.c
index 4783b9f5f..edc72eba9 100644
--- a/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.c
+++ b/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.c
@@ -172,6 +172,54 @@ void nvidia_gpu_get_thermal_data(int chipid, int verbose, FILE *output)
     /*!@todo: Print GPU memory temperature */
 }
 
+/// @brief Add the temperature of each GPU on one socket to a JSON object.
+///
+/// Readings are stored under output["socket_<chipid>"]["GPU"] as one
+/// "temp_celsius_gpu_<d>" integer per device. A device whose temperature
+/// query fails is left out rather than reported with an undefined value.
+///
+/// @param [in] chipid Socket index that selects the GPUs to query.
+/// @param [in,out] output JSON object that receives the socket entry.
+void nvidia_gpu_get_thermal_json(int chipid, json_t *output)
+{
+    unsigned gpu_temp;
+    int d;
+    nvmlReturn_t result;
+
+    char socket_id[24];
+    snprintf(socket_id, sizeof(socket_id), "socket_%d", chipid);
+
+    // Reuse the socket object if the node object already has one,
+    // otherwise attach a new one.
+    json_t *socket_obj = json_object_get(output, socket_id);
+    if (socket_obj == NULL)
+    {
+        socket_obj = json_object();
+        json_object_set_new(output, socket_id, socket_obj);
+    }
+
+    // Create a new JSON object for the GPU readings of this socket.
+    json_t *gpu_obj = json_object();
+    json_object_set_new(socket_obj, "GPU", gpu_obj);
+
+    for (d = chipid * (int)m_gpus_per_socket;
+         d < (chipid + 1) * (int)m_gpus_per_socket; ++d)
+    {
+        result = nvmlDeviceGetTemperature(m_unit_devices_file_desc[d],
+                                          NVML_TEMPERATURE_GPU, &gpu_temp);
+        if (result != NVML_SUCCESS)
+        {
+            // gpu_temp was not written by NVML; skip this device.
+            continue;
+        }
+
+        // Store the temperature under a key that carries the device id.
+        char device_id[32];
+        snprintf(device_id, sizeof(device_id), "temp_celsius_gpu_%d", d);
+        json_object_set_new(gpu_obj, device_id, json_integer(gpu_temp));
+    }
+}
+
 void nvidia_gpu_get_power_limits_data(int chipid, int verbose, FILE *output)
 {
     unsigned int power_limit;
diff --git a/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.h b/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.h
index 899b39397..60121f6a5 100644
--- a/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.h
+++ b/src/variorum/Nvidia_GPU/nvidia_gpu_power_features.h
@@ -10,6 +10,7 @@
 
 #include
 #include
+#include <jansson.h>
 
 extern unsigned m_total_unit_devices;
 extern nvmlDevice_t *m_unit_devices_file_desc;
@@ -32,4 +33,6 @@ void nvidia_gpu_get_gpu_utilization_data(int chipid, int verbose, FILE *output);
 
 void cap_each_gpu_power_limit(int chipid, unsigned int powerlimit);
 
+void nvidia_gpu_get_thermal_json(int chipid, json_t *output);
+
 #endif
diff --git a/src/variorum/config_architecture.c b/src/variorum/config_architecture.c
index edb9e9f40..4b4a8b637 100644
--- a/src/variorum/config_architecture.c
+++ b/src/variorum/config_architecture.c
@@ -358,6 +358,7 @@ void variorum_init_func_ptrs()
         g_platform[i].variorum_get_node_power_json = NULL;
         g_platform[i].variorum_get_node_power_domain_info_json = NULL;
         g_platform[i].variorum_print_energy = NULL;
+        g_platform[i].variorum_get_thermals_json = NULL;
     }
 }
 
diff --git a/src/variorum/config_architecture.h b/src/variorum/config_architecture.h
index 1f3ff44cb..aecb78f6b 100644
--- a/src/variorum/config_architecture.h
+++ b/src/variorum/config_architecture.h
@@ -10,6 +10,8 @@
 
 #include
 
+#include <jansson.h>
+
 /// @brief Create a mask from bit m to n (63 >= m >= n >= 0).
 ///
 /// Example: MASK_RANGE(4,2) --> (((1<<((4)-(2)+1))-1)<<(2))
@@ -245,6 +247,11 @@ struct platform
     /// @return Error code.
     int (*variorum_get_node_power_domain_info_json)(char **get_domain_obj_str);
 
+    /// @brief Function pointer to get JSON object for thermal information
+    ///
+    /// @return Error code.
+    int (*variorum_get_thermals_json)(json_t *get_thermal_obj);
+
     /// @brief Function pointer to get list of available frequencies.
     ///
     /// @return Error code.
diff --git a/src/variorum/variorum.c b/src/variorum/variorum.c
index ba452bf6c..dcc546ba8 100644
--- a/src/variorum/variorum.c
+++ b/src/variorum/variorum.c
@@ -6,6 +6,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 #include
 #include
@@ -1147,6 +1150,66 @@ int variorum_get_node_power_domain_info_json(char **get_domain_obj_str)
     return err;
 }
 
+/// Build the node-level thermal JSON: one object keyed by hostname holding a
+/// "timestamp" (microseconds) plus whatever each platform adds. The string
+/// stored in *get_thermal_obj_str comes from json_dumps().
+int variorum_get_thermals_json(char **get_thermal_obj_str)
+{
+    int err = 0;
+    int i;
+    uint64_t ts;
+    err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
+    if (err)
+    {
+        return -1;
+    }
+
+    // Zero-filled with one byte held back, so the name stays terminated even
+    // if gethostname() fails or truncates.
+    char hostname[1024] = {0};
+    gethostname(hostname, sizeof(hostname) - 1);
+
+    struct timeval tv;
+
+    json_t *get_thermal_obj = json_object();
+    json_t *node_obj = json_object();
+    json_object_set_new(get_thermal_obj, hostname, node_obj);
+
+    gettimeofday(&tv, NULL);
+    ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec;
+    json_object_set_new(node_obj, "timestamp", json_integer(ts));
+
+    for (i = 0; i < P_NUM_PLATFORMS; i++)
+    {
+        if (g_platform[i].variorum_get_thermals_json == NULL)
+        {
+            variorum_error_handler("Feature not yet implemented or is not supported",
+                                   VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED,
+                                   getenv("HOSTNAME"), __FILE__,
+                                   __FUNCTION__, __LINE__);
+            continue;
+        }
+        err = g_platform[i].variorum_get_thermals_json(node_obj);
+        if (err)
+        {
+            // The object is not handed to the caller on failure; release it.
+            json_decref(get_thermal_obj);
+            return -1;
+        }
+    }
+
+    *get_thermal_obj_str = json_dumps(get_thermal_obj, JSON_INDENT(4));
+    json_decref(get_thermal_obj);
+
+    err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
+    // json_dumps() returns NULL on error; report that as a failure too.
+    if (err || *get_thermal_obj_str == NULL)
+    {
+        return -1;
+    }
+    return err;
+}
+
 int variorum_print_available_frequencies(void)
 {
     int err = 0;
diff --git a/src/variorum/variorum.h b/src/variorum/variorum.h
index da4d099bb..17e160d22 100644
--- a/src/variorum/variorum.h
+++ b/src/variorum/variorum.h
@@ -564,6 +564,29 @@ int variorum_get_node_power_json(char **get_power_obj_str);
 /// check for NULL
strings.
 int variorum_get_node_power_domain_info_json(char **get_domain_obj_str);
 
+/// @brief Populate a string in nested JSON format for temperature readouts.
+/// Format: hostname { socket_n { CPU { Core { Sensor Name : Temp in C }, Mem { Sensor Name : Temp in C } }, GPU { Device : Temp in C } }, timestamp }
+/// where n is the socket number.
+///
+/// @supparch
+/// - Intel Sandy Bridge
+/// - Intel Ivy Bridge
+/// - Intel Haswell
+/// - Intel Broadwell
+/// - Intel Skylake
+/// - Intel Kaby Lake
+/// - IBM Power9
+/// - AMD Instinct
+/// - Nvidia Volta
+///
+/// @param [out] get_thermal_obj_str String (passed by reference) that contains
+/// node-level thermal information.
+///
+/// @returns 0 if successful, otherwise -1. Platforms that do not implement
+/// this feature are reported through the error handler and skipped rather
+/// than causing a failure.
+int variorum_get_thermals_json(char **get_thermal_obj_str);
+
 /// @brief Returns Variorum version as a constant string.
 ///
 /// @supparch