Skip to content

Commit

Permalink
add json output for thermal data (#461)
Browse files Browse the repository at this point in the history
- add variorum_get_thermal_json function interfaces
- add docs
- supports AMD GPUs, nVidia GPUs, Intel CPUs, IBM Power9 CPUs
  • Loading branch information
slabasan authored Dec 1, 2023
1 parent 035ebd5 commit 49db931
Show file tree
Hide file tree
Showing 40 changed files with 714 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/docs/sphinx/VariorumAPI.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ implementations in Variorum are described in the following sections:
- :doc:`api/cap_functions`
- :doc:`api/json_support_functions`
- :doc:`api/enable_disable_functions`
- :doc:`api/json`

*******************
Variorum Wrappers
Expand Down
48 changes: 48 additions & 0 deletions src/docs/sphinx/api/json.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
.. # Copyright 2019-2023 Lawrence Livermore National Security, LLC and other
# Variorum Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: MIT
##########
JSON API
##########

*******************************
Obtaining Thermal Information
*******************************

The API to obtain node thermal information has the following format. It takes
a string (``char**``) by reference as input, and populates this string with a
nested JSON object with hostname, followed by socket_{number}, followed by CPU
and/or GPU (depending on the platform, may contain only one or both), followed
by Core and Mem for CPU.

The ``variorum_get_thermals_json(char **)`` function returns a string type
nested JSON object. An example is provided below::

{
"hostname": {
"Socket_0": {
"CPU": {
"Core": {
"temp_celsius_core_0": (Integer),
...
"temp_celsius_core_i": (Integer),
},
"Mem": {
"temp_celsius_dimm_0": (Integer),
...
"temp_celsius_dimm_i": (Integer),
},
},
"GPU": {
"temp_celsius_gpu_0": (Integer),
...
"temp_celsius_gpu_i": (Integer),
}
},
"timestamp" : (Integer)
}
}

Here, ``i`` is the index of the core or GPU and ``0 <= i < num_cores/GPUs``.
1 change: 1 addition & 0 deletions src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(BASIC_EXAMPLES
variorum-enable-turbo-example
variorum-get-node-power-json-example
variorum-get-node-power-domain-info-json-example
variorum-get-node-thermal-json-example
variorum-integration-using-json-example
variorum-get-topology-info-example
variorum-monitoring-to-file-example
Expand Down
51 changes: 51 additions & 0 deletions src/examples/variorum-get-node-thermal-json-example.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other
// Variorum Project Developers. See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: MIT

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>

#include <variorum.h>
#include <variorum_topology.h>

/*
 * Example driver: query the node's thermal data through
 * variorum_get_thermals_json() and print the resulting JSON string.
 *
 * Options:
 *   -h  print usage and exit
 *   -v  print the Variorum version and exit
 *
 * Returns 0 on success; -1 on an unknown option or when the query fails.
 */
int main(int argc, char **argv)
{
    int ret;
    char *s = NULL;

    const char *usage = "Usage: %s [-h] [-v]\n";
    int opt;
    while ((opt = getopt(argc, argv, "hv")) != -1)
    {
        switch (opt)
        {
            case 'h':
                printf(usage, argv[0]);
                return 0;
            case 'v':
                printf("%s\n", variorum_get_current_version());
                return 0;
            default:
                fprintf(stderr, usage, argv[0]);
                return -1;
        }
    }

    ret = variorum_get_thermals_json(&s);
    if (ret != 0)
    {
        /* Diagnostics belong on stderr so they never mix with JSON on stdout. */
        fprintf(stderr, "JSON get thermals failed!\n");
        /* s is either still NULL or whatever the call stored; free(NULL) is a no-op. */
        free(s);
        exit(-1);
    }

    /* Print the entire JSON object */
    puts(s);

    /* Deallocate the string */
    free(s);

    return ret;
}
84 changes: 82 additions & 2 deletions src/variorum/AMD_GPU/amd_gpu_power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output)
static int init = 0;
static struct timeval start;
struct timeval now;
int i;

gethostname(hostname, 1024);

Expand Down Expand Up @@ -316,8 +317,7 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output)

gettimeofday(&now, NULL);

for (int i = chipid * gpus_per_socket;
i < (chipid + 1) * gpus_per_socket; i++)
for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++)
{
int64_t temp_val = -1;
double temp_val_flt = -1.0;
Expand Down Expand Up @@ -379,6 +379,86 @@ void get_thermals_data(int chipid, int total_sockets, int verbose, FILE *output)
}
}

/*
 * Add the current edge temperature of every GPU that belongs to one socket
 * to a JSON object.
 *
 * chipid         Index of the socket whose GPUs are sampled.
 * total_sockets  Number of sockets on the node; the monitored GPUs are split
 *                evenly across them (num_devices / total_sockets each).
 * output         JSON object that receives (or already holds) a
 *                "socket_<chipid>" object. A fresh "GPU" object is stored in
 *                it with one "temp_celsius_gpu_<i>" real per GPU.
 *
 * A GPU whose temperature cannot be read is still listed, with the value
 * -1.0, after the failure is reported through variorum_error_handler().
 * The process exits if RSMI cannot be initialized. If the device count
 * cannot be obtained, or total_sockets is not positive, the function returns
 * without modifying output.
 */
void get_thermals_json(int chipid, int total_sockets, json_t *output)
{
    rsmi_status_t ret;
    uint32_t num_devices = 0;
    int gpus_per_socket;
    int i;
    char socketid[32];
    json_t *socket_obj;
    json_t *gpu_obj;

    /* Guard the division below. */
    if (total_sockets <= 0)
    {
        variorum_error_handler("Invalid number of sockets",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        return;
    }

    ret = rsmi_init(0);
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not initialize RSMI",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        exit(-1);
    }

    ret = rsmi_num_monitor_devices(&num_devices);
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not get number of GPU devices",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        /* Without a device count there is nothing meaningful to iterate. */
        goto shutdown;
    }

    gpus_per_socket = num_devices / total_sockets;

    snprintf(socketid, sizeof(socketid), "socket_%d", chipid);

    // check if socket object is in node object
    socket_obj = json_object_get(output, socketid);
    if (socket_obj == NULL)
    {
        socket_obj = json_object();
        json_object_set_new(output, socketid, socket_obj);
    }

    // general gpu object
    gpu_obj = json_object();
    json_object_set_new(socket_obj, "GPU", gpu_obj);

    for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++)
    {
        int64_t temp_val = -1;
        double temp_val_flt = -1.0; // sentinel reported when the read fails
        char gpuid[32];

        ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT,
                                       &temp_val);
        if (ret != RSMI_STATUS_SUCCESS)
        {
            variorum_error_handler("RSMI API was not successful",
                                   VARIORUM_ERROR_PLATFORM_ENV,
                                   getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                                   __LINE__);
        }
        else
        {
            // Convert to Celsius. Divide in floating point so the fractional
            // part of the reading is not truncated away.
            temp_val_flt = (double)temp_val / 1000.0;
        }

        // gpu entry
        snprintf(gpuid, sizeof(gpuid), "temp_celsius_gpu_%d", i);
        json_object_set_new(gpu_obj, gpuid, json_real(temp_val_flt));
    }

shutdown:
    ret = rsmi_shut_down();
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not shutdown RSMI",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
    }
}

void get_clocks_data(int chipid, int total_sockets, int verbose, FILE *output)
{
rsmi_status_t ret;
Expand Down
3 changes: 3 additions & 0 deletions src/variorum/AMD_GPU/amd_gpu_power_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <stdint.h>
#include <stdio.h>
#include <jansson.h>

#include <rocm_smi/rocm_smi.h>

Expand All @@ -22,4 +23,6 @@ void get_gpu_utilization_data(int chipid, int total_sockets, int verbose,
void cap_each_gpu_power_limit(int chipid, int total_sockets,
unsigned int powerlimit);

void get_thermals_json(int chipid, int total_sockets, json_t *output);

#endif
2 changes: 2 additions & 0 deletions src/variorum/AMD_GPU/config_amd_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ int set_amd_gpu_func_ptrs(int idx)
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
amd_gpu_instinct_cap_each_gpu_power_limit;
g_platform[idx].variorum_get_thermals_json =
amd_gpu_instinct_get_thermals_json;
}
else
{
Expand Down
21 changes: 21 additions & 0 deletions src/variorum/AMD_GPU/instinctGPU.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,27 @@ int amd_gpu_instinct_get_thermals(int verbose)
return 0;
}

/*
 * Collect GPU thermal data for every socket into get_thermal_obj.
 *
 * Queries the socket count for the AMD GPU platform via
 * variorum_get_topology() and calls get_thermals_json() once per socket.
 * When the VARIORUM_LOG environment variable parses to 1, the function name
 * is printed first. Always returns 0.
 */
int amd_gpu_instinct_get_thermals_json(json_t *get_thermal_obj)
{
    unsigned nsockets;
    unsigned sock;
    const char *log_env = getenv("VARIORUM_LOG");

    if (log_env != NULL && atoi(log_env) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    variorum_get_topology(&nsockets, NULL, NULL, P_AMD_GPU_IDX);

    /* One pass per socket; each call fills in that socket's GPU entries. */
    sock = 0;
    while (sock < nsockets)
    {
        get_thermals_json(sock, nsockets, get_thermal_obj);
        sock++;
    }

    return 0;
}

int amd_gpu_instinct_get_clocks(int verbose)
{
char *val = getenv("VARIORUM_LOG");
Expand Down
4 changes: 4 additions & 0 deletions src/variorum/AMD_GPU/instinctGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
#ifndef INSTINCTGPU_H_INCLUDE
#define INSTINCTGPU_H_INCLUDE

#include <jansson.h>
#include <sys/time.h>

int amd_gpu_instinct_get_power(int verbose);
int amd_gpu_instinct_get_power_limit(int verbose);
int amd_gpu_instinct_get_thermals(int verbose);
int amd_gpu_instinct_get_clocks(int verbose);
int amd_gpu_instinct_get_gpu_utilization(int verbose);
int amd_gpu_instinct_cap_each_gpu_power_limit(unsigned int powerlimit);
int amd_gpu_instinct_get_thermals_json(json_t *get_thermal_obj);

#endif
62 changes: 62 additions & 0 deletions src/variorum/IBM/Power9.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,68 @@ int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str)
return 0;
}

/*
 * Collect thermal sensor data for every socket into get_thermal_obj.
 *
 * For each socket, one block of OCC_SENSOR_DATA_BLOCK_SIZE bytes is read from
 * /sys/firmware/opal/exports/occ_inband_sensors at offset
 * (socket index * OCC_SENSOR_DATA_BLOCK_SIZE) and handed to
 * json_get_thermal_sensors() together with the socket index.
 *
 * When the VARIORUM_LOG environment variable parses to 1, the function name
 * is printed first.
 *
 * Returns 0 on success, -1 if the sensors file cannot be opened, the buffer
 * cannot be allocated, or a full block cannot be read. The file descriptor
 * and buffer are released on every path.
 */
int ibm_cpu_p9_get_node_thermal_json(json_t *get_thermal_obj)
{
    char *val = getenv("VARIORUM_LOG");
    if (val != NULL && atoi(val) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    /* char * (not void *) so the offset arithmetic below is standard C. */
    char *buf = NULL;
    int fd;
    int rc;
    int bytes;
    int status = -1;
    unsigned iter = 0;
    /* Stays 0 (no sockets processed) if the topology query is compiled out. */
    unsigned nsockets = 0;

#ifdef VARIORUM_WITH_IBM_CPU
    variorum_get_topology(&nsockets, NULL, NULL, P_IBM_CPU_IDX);
#endif

    fd = open("/sys/firmware/opal/exports/occ_inband_sensors", O_RDONLY);
    if (fd < 0)
    {
        printf("Failed to open occ_inband_sensors file\n");
        return -1;
    }

    /* One buffer is reused for all sockets; each pass overwrites it fully. */
    buf = malloc(OCC_SENSOR_DATA_BLOCK_SIZE);
    if (!buf)
    {
        printf("Failed to allocate\n");
        goto cleanup;
    }

    for (iter = 0; iter < nsockets; iter++)
    {
        if (lseek(fd, (off_t)iter * OCC_SENSOR_DATA_BLOCK_SIZE, SEEK_SET) < 0)
        {
            printf("Failed to seek in occ_inband_sensors file\n");
            goto cleanup;
        }

        /* read() may return short counts; loop until the block is complete. */
        for (rc = bytes = 0; bytes < OCC_SENSOR_DATA_BLOCK_SIZE; bytes += rc)
        {
            rc = read(fd, buf + bytes, OCC_SENSOR_DATA_BLOCK_SIZE - bytes);

            if (rc <= 0)
            {
                break;
            }
        }

        if (bytes != OCC_SENSOR_DATA_BLOCK_SIZE)
        {
            printf("Failed to read data\n");
            goto cleanup;
        }

        json_get_thermal_sensors(iter, get_thermal_obj, buf);
    }

    status = 0;

cleanup:
    free(buf);
    close(fd);
    return status;
}

int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str)
{
char *val = ("VARIORUM_LOG");
Expand Down
2 changes: 2 additions & 0 deletions src/variorum/IBM/Power9.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,6 @@ int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str);

int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str);

int ibm_cpu_p9_get_node_thermal_json(json_t *get_thermal_obj);

#endif
2 changes: 2 additions & 0 deletions src/variorum/IBM/config_ibm.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ int set_ibm_func_ptrs(int idx)
g_platform[idx].variorum_get_node_power_json = ibm_cpu_p9_get_node_power_json;
g_platform[idx].variorum_get_node_power_domain_info_json =
ibm_cpu_p9_get_node_power_domain_info_json;
g_platform[idx].variorum_get_thermals_json =
ibm_cpu_p9_get_node_thermal_json;
}
else
{
Expand Down
Loading

0 comments on commit 49db931

Please sign in to comment.