Skip to content

Commit

Permalink
json output for frequency data (#488)
Browse files Browse the repository at this point in the history
- add docs
- add variorum_get_frequency_json function interfaces
- supports Intel CPUs, AMD CPUs, IBM Power9 CPUs, AMD GPUs, nVidia GPUs

---------

Co-authored-by: Kyle Fan <[email protected]>
  • Loading branch information
slabasan and kfan326 authored Jan 15, 2024
1 parent 5118852 commit 576ab21
Show file tree
Hide file tree
Showing 42 changed files with 712 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/docs/sphinx/api/json_support_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ Defined in ``variorum/variorum.h``.
.. doxygenfunction:: variorum_get_node_power_domain_info_json

.. doxygenfunction:: variorum_get_thermals_json

.. doxygenfunction:: variorum_get_node_frequency_json
1 change: 1 addition & 0 deletions src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(BASIC_EXAMPLES
variorum-cap-socket-power-limit-example
variorum-disable-turbo-example
variorum-enable-turbo-example
variorum-get-node-frequency-json-example
variorum-get-node-power-domain-info-json-example
variorum-get-node-power-json-example
variorum-get-node-thermal-json-example
Expand Down
45 changes: 45 additions & 0 deletions src/examples/variorum-get-node-frequency-json-example.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other
// Variorum Project Developers. See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: MIT

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>

#include <variorum.h>

/**
 * Example driver: query the node frequency data as a JSON string and print it.
 *
 * Command-line options:
 *   -h  print the usage line and return 0
 *   -v  print the Variorum version and return 0
 * Any other option prints the usage line to stderr and returns -1.
 *
 * The string filled in by variorum_get_node_frequency_json() is released
 * here with free() once it has been printed.
 *
 * @return 0 on success; the process exits with -1 if the query fails.
 */
int main(int argc, char **argv)
{
    const char *usage = "Usage: %s [-h] [-v]\n";
    int opt;

    while ((opt = getopt(argc, argv, "hv")) != -1)
    {
        switch (opt)
        {
            case 'h':
                printf(usage, argv[0]);
                return 0;
            case 'v':
                printf("%s\n", variorum_get_current_version());
                return 0;
            default:
                fprintf(stderr, usage, argv[0]);
                return -1;
        }
    }

    char *s = NULL;
    int ret = variorum_get_node_frequency_json(&s);

    // Treat a missing string as a failure too: puts(NULL) is undefined
    // behavior. Diagnostics go to stderr so stdout carries only the JSON.
    if (ret != 0 || s == NULL)
    {
        fprintf(stderr, "Variorum get frequency json failure!\n");
        free(s);
        exit(-1);
    }

    puts(s);
    free(s);

    return ret;
}
1 change: 1 addition & 0 deletions src/variorum/AMD/config_amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ int set_amd_func_ptrs(int idx)
g_platform[idx].variorum_get_node_power_json = amd_cpu_epyc_get_node_power_json;
g_platform[idx].variorum_get_node_power_domain_info_json =
amd_cpu_epyc_get_node_power_domain_info_json;
g_platform[idx].variorum_get_frequency_json = amd_cpu_epyc_get_json_boostlimit;
break;
default:
fprintf(stdout, "ESMI not initialized, drivers not found. "
Expand Down
45 changes: 45 additions & 0 deletions src/variorum/AMD/epyc.c
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,51 @@ int amd_cpu_epyc_print_boostlimit()
return 0;
}

/**
 * Add the boost limit of every core to a JSON object, grouped by socket.
 *
 * For each socket N the object "socket_N" is looked up in (or added to)
 * get_clock_obj_json. A fresh "CPU" object holding a "core" object is then
 * attached to it, and one entry per core is stored under the key
 * "core_<id>_avg_freq_mhz". The stored value is the number reported by
 * esmi_core_boostlimit_get() for that core.
 *
 * Cores are assumed to be numbered consecutively across sockets, with
 * total_cores / num_sockets cores on each socket.
 *
 * @param get_clock_obj_json JSON object to populate; must not be NULL.
 *
 * @return 0 on success, -1 on invalid arguments or topology, otherwise the
 *         non-zero status returned by esmi_core_boostlimit_get().
 */
int amd_cpu_epyc_get_json_boostlimit(json_t *get_clock_obj_json)
{
    char *val = getenv("VARIORUM_LOG");
    if (val != NULL && atoi(val) == 1)
    {
        printf("Running %s\n\n", __FUNCTION__);
    }

    int num_sockets = g_platform[P_AMD_CPU_IDX].num_sockets;
    int total_cores = g_platform[P_AMD_CPU_IDX].total_cores;

    // Guard the division below and the JSON calls against bad inputs.
    if (get_clock_obj_json == NULL || num_sockets <= 0)
    {
        fprintf(stderr, "%s: invalid JSON object or socket count (%d)\n",
                __FUNCTION__, num_sockets);
        return -1;
    }

    int cores_per_socket = total_cores / num_sockets;
    int current_core = 0;

    for (int socket = 0; socket < num_sockets; ++socket)
    {
        char socket_name[16];
        snprintf(socket_name, sizeof socket_name, "socket_%d", socket);

        // Reuse the socket object if another backend already created it.
        json_t *socket_obj = json_object_get(get_clock_obj_json, socket_name);
        if (socket_obj == NULL)
        {
            socket_obj = json_object();
            json_object_set_new(get_clock_obj_json, socket_name, socket_obj);
        }

        json_t *cpu_obj = json_object();
        json_object_set_new(socket_obj, "CPU", cpu_obj);

        json_t *core_obj = json_object();
        json_object_set_new(cpu_obj, "core", core_obj);

        for (int core = 0; core < cores_per_socket; ++core)
        {
            uint32_t boostlimit = 0;
            int ret = esmi_core_boostlimit_get(current_core, &boostlimit);
            if (ret != 0)
            {
                // Do not publish a value that was never written.
                fprintf(stderr, "Failed to get core[%d] boostlimit, Err[%d]\n",
                        current_core, ret);
                return ret;
            }

            char core_avg_string[24];
            snprintf(core_avg_string, sizeof core_avg_string,
                     "core_%d_avg_freq_mhz", current_core);
            json_object_set_new(core_obj, core_avg_string,
                                json_real(boostlimit));
            current_core++;
        }
    }

    return 0;
}

int amd_cpu_epyc_set_each_core_boostlimit(int boostlimit)
{
char *val = getenv("VARIORUM_LOG");
Expand Down
4 changes: 4 additions & 0 deletions src/variorum/AMD/epyc.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,8 @@ int amd_cpu_epyc_get_node_power_domain_info_json(
char **get_domain_obj_str
);

/// @brief Add the per-core boost limit of every socket to a JSON object.
///
/// Entries are stored as "socket_<n>" -> "CPU" -> "core" ->
/// "core_<id>_avg_freq_mhz".
///
/// @param [out] get_clock_obj_json JSON object to populate.
///
/// @return 0 on success.
int amd_cpu_epyc_get_json_boostlimit(
json_t *get_clock_obj_json
);

#endif
86 changes: 86 additions & 0 deletions src/variorum/AMD_GPU/amd_gpu_power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,92 @@ void get_clocks_data(int chipid, int total_sockets, int verbose, FILE *output)
}
}

/**
 * Add the current system and memory clocks of one socket's GPUs to a JSON
 * object.
 *
 * The devices reported by RSMI are split evenly across total_sockets; this
 * call handles devices [chipid * gpus_per_socket,
 * (chipid + 1) * gpus_per_socket). The object "socket_<chipid>" is looked up
 * in (or added to) output and receives a fresh "GPU" object with the keys
 * "gpu_<i>_freq_mhz" and "gpu_<i>_mem_freq_mhz".
 *
 * RSMI is initialised on entry and shut down before returning. Failures are
 * reported through variorum_error_handler(). A reading whose query failed is
 * left out rather than filled from uninitialised data.
 *
 * @param chipid        Index of the socket to report.
 * @param total_sockets Number of sockets the devices are divided across.
 * @param output        JSON object to populate.
 */
void get_clocks_json(int chipid, int total_sockets, json_t *output)
{
    rsmi_status_t ret;
    uint32_t num_devices = 0;
    int gpus_per_socket;
    char socketID[16];

    // Guard the division below.
    if (total_sockets <= 0)
    {
        variorum_error_handler("Invalid number of sockets",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        return;
    }

    snprintf(socketID, sizeof socketID, "socket_%d", chipid);

    ret = rsmi_init(0);
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not initialize RSMI",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        // Nothing can be queried without a working RSMI session.
        return;
    }

    ret = rsmi_num_monitor_devices(&num_devices);
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not get number of GPU devices",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
        goto shutdown;
    }

    gpus_per_socket = num_devices / total_sockets;

    // Reuse the socket object if another backend already created it.
    json_t *socket_obj = json_object_get(output, socketID);
    if (socket_obj == NULL)
    {
        socket_obj = json_object();
        json_object_set_new(output, socketID, socket_obj);
    }

    json_t *gpu_obj = json_object();
    json_object_set_new(socket_obj, "GPU", gpu_obj);

    for (int i = chipid * gpus_per_socket;
         i < (chipid + 1) * gpus_per_socket; i++)
    {
        rsmi_frequencies_t f_sys, f_mem;
        char key[32];

        ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f_sys);
        if (ret != RSMI_STATUS_SUCCESS)
        {
            variorum_error_handler("RSMI API was not successful",
                                   VARIORUM_ERROR_PLATFORM_ENV,
                                   getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                                   __LINE__);
        }
        else
        {
            // Convert to MHz
            uint32_t f_sys_val =
                (uint32_t)(f_sys.frequency[f_sys.current] / (1000 * 1000));
            snprintf(key, sizeof key, "gpu_%d_freq_mhz", i);
            json_object_set_new(gpu_obj, key, json_integer(f_sys_val));
        }

        ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &f_mem);
        if (ret != RSMI_STATUS_SUCCESS)
        {
            variorum_error_handler("RSMI API was not successful",
                                   VARIORUM_ERROR_PLATFORM_ENV,
                                   getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                                   __LINE__);
        }
        else
        {
            // Convert to MHz
            uint32_t f_mem_val =
                (uint32_t)(f_mem.frequency[f_mem.current] / (1000 * 1000));
            snprintf(key, sizeof key, "gpu_%d_mem_freq_mhz", i);
            json_object_set_new(gpu_obj, key, json_integer(f_mem_val));
        }
    }

shutdown:
    ret = rsmi_shut_down();
    if (ret != RSMI_STATUS_SUCCESS)
    {
        variorum_error_handler("Could not shutdown RSMI",
                               VARIORUM_ERROR_PLATFORM_ENV,
                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
                               __LINE__);
    }
}

void get_gpu_utilization_data(int chipid, int total_sockets, int verbose,
FILE *output)
{
Expand Down
6 changes: 6 additions & 0 deletions src/variorum/AMD_GPU/amd_gpu_power_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,10 @@ void get_thermals_json(
json_t *output
);

/// @brief Add the system and memory clocks of one socket's GPUs to a JSON
/// object.
///
/// @param [in] chipid Index of the socket whose GPUs are reported.
/// @param [in] total_sockets Number of sockets the GPUs are divided across.
/// @param [out] output JSON object to populate; values are stored under
/// "socket_<chipid>" -> "GPU".
void get_clocks_json(
int chipid,
int total_sockets,
json_t *output
);

#endif
1 change: 1 addition & 0 deletions src/variorum/AMD_GPU/config_amd_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ int set_amd_gpu_func_ptrs(int idx)
g_platform[idx].variorum_print_gpu_utilization =
amd_gpu_instinct_get_gpu_utilization;
g_platform[idx].variorum_get_thermals_json = amd_gpu_instinct_get_thermals_json;
g_platform[idx].variorum_get_frequency_json = amd_gpu_instinct_get_clocks_json;
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
amd_gpu_instinct_cap_each_gpu_power_limit;
Expand Down
21 changes: 21 additions & 0 deletions src/variorum/AMD_GPU/instinctGPU.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,27 @@ int amd_gpu_instinct_get_clocks(int verbose)
return 0;
}

/**
 * Collect the GPU clock readings of every socket into a JSON object.
 *
 * The socket count is obtained from variorum_get_topology() for the AMD GPU
 * platform, and get_clocks_json() is called once per socket with the same
 * output object.
 *
 * @param get_clock_obj_json JSON object handed to get_clocks_json().
 *
 * @return Always 0.
 */
int amd_gpu_instinct_get_clocks_json(json_t *get_clock_obj_json)
{
    unsigned socket_count;
    unsigned socket = 0;
    const char *log_level = getenv("VARIORUM_LOG");

    // Trace the call only when VARIORUM_LOG is set to 1.
    if (log_level != NULL && atoi(log_level) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    variorum_get_topology(&socket_count, NULL, NULL, P_AMD_GPU_IDX);

    while (socket < socket_count)
    {
        get_clocks_json(socket, socket_count, get_clock_obj_json);
        socket++;
    }

    return 0;
}

int amd_gpu_instinct_get_gpu_utilization(int verbose)
{
char *val = getenv("VARIORUM_LOG");
Expand Down
4 changes: 4 additions & 0 deletions src/variorum/AMD_GPU/instinctGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,8 @@ int amd_gpu_instinct_get_thermals_json(
json_t *get_thermal_obj
);

/// @brief Collect the GPU clock readings of every socket into a JSON object.
///
/// @param [out] get_clocks_obj_json JSON object to populate.
///
/// @return 0 on success.
int amd_gpu_instinct_get_clocks_json(
json_t *get_clocks_obj_json
);

#endif
61 changes: 61 additions & 0 deletions src/variorum/IBM/Power9.c
Original file line number Diff line number Diff line change
Expand Up @@ -571,3 +571,64 @@ int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str)

return 0;
}

/**
 * Read the OCC in-band sensor block of every socket and add its frequency
 * data to a JSON object.
 *
 * One block of OCC_SENSOR_DATA_BLOCK_SIZE bytes per socket is read from
 * /sys/firmware/opal/exports/occ_inband_sensors, at offset
 * socket * OCC_SENSOR_DATA_BLOCK_SIZE, and handed to
 * json_get_frequency_sensors() together with the socket index.
 *
 * @param get_frequency_obj_json JSON object passed on to
 *                               json_get_frequency_sensors().
 *
 * @return 0 on success, -1 if the sensor file cannot be opened, positioned
 *         or fully read, or if the read buffer cannot be allocated.
 */
int ibm_cpu_p9_get_node_frequency_json(json_t *get_frequency_obj_json)
{
    char *val = getenv("VARIORUM_LOG");
    if (val != NULL && atoi(val) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    void *buf = NULL;
    int fd;
    int status = -1;
    // Stays 0 (no sockets processed) when the IBM backend is compiled out.
    unsigned nsockets = 0;

#ifdef VARIORUM_WITH_IBM_CPU
    variorum_get_topology(&nsockets, NULL, NULL, P_IBM_CPU_IDX);
#endif

    fd = open("/sys/firmware/opal/exports/occ_inband_sensors", O_RDONLY);
    if (fd < 0)
    {
        printf("Failed to open occ_inband_sensors file\n");
        return -1;
    }

    // One buffer is reused for every socket's block.
    buf = malloc(OCC_SENSOR_DATA_BLOCK_SIZE);
    if (!buf)
    {
        printf("Failed to allocate\n");
        goto out;
    }

    for (unsigned iter = 0; iter < nsockets; iter++)
    {
        if (lseek(fd, (off_t)iter * OCC_SENSOR_DATA_BLOCK_SIZE, SEEK_SET) < 0)
        {
            printf("Failed to seek in occ_inband_sensors file\n");
            goto out;
        }

        // read() may return short counts; loop until the block is complete,
        // or stop on EOF or error.
        int bytes = 0;
        while (bytes < OCC_SENSOR_DATA_BLOCK_SIZE)
        {
            ssize_t n = read(fd, (char *)buf + bytes,
                             OCC_SENSOR_DATA_BLOCK_SIZE - bytes);
            if (n <= 0)
            {
                break;
            }
            bytes += (int)n;
        }

        if (bytes != OCC_SENSOR_DATA_BLOCK_SIZE)
        {
            printf("Failed to read data\n");
            goto out;
        }

        json_get_frequency_sensors(iter, get_frequency_obj_json, buf);
    }

    status = 0;

out:
    // Single exit path so the buffer and descriptor are released on error too.
    free(buf);
    close(fd);
    return status;
}
4 changes: 4 additions & 0 deletions src/variorum/IBM/Power9.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,8 @@ int ibm_cpu_p9_get_node_thermal_json(
json_t *get_thermal_obj
);

/// @brief Read the OCC in-band sensor data of every socket and add its
/// frequency data to a JSON object.
///
/// @param [out] get_frequency_obj_json JSON object to populate.
///
/// @return 0 on success, -1 if the sensor file cannot be opened or read.
int ibm_cpu_p9_get_node_frequency_json(
json_t *get_frequency_obj_json
);

#endif
2 changes: 2 additions & 0 deletions src/variorum/IBM/config_ibm.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ int set_ibm_func_ptrs(int idx)
g_platform[idx].variorum_get_node_power_domain_info_json =
ibm_cpu_p9_get_node_power_domain_info_json;
g_platform[idx].variorum_get_thermals_json = ibm_cpu_p9_get_node_thermal_json;
g_platform[idx].variorum_get_frequency_json =
ibm_cpu_p9_get_node_frequency_json;
}
else
{
Expand Down
Loading

0 comments on commit 576ab21

Please sign in to comment.