Skip to content

Commit

Permalink
fix(monitoring): merge the telegraf spinup code for all hosts
Browse files Browse the repository at this point in the history
Signed-off-by: sakethanne <[email protected]>
  • Loading branch information
Sakethanne committed Oct 3, 2024
1 parent f95e77e commit 18d3144
Showing 1 changed file with 68 additions and 64 deletions.
132 changes: 68 additions & 64 deletions ansible/monitoring.yml
Original file line number Diff line number Diff line change
@@ -1,78 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2022 Dell Inc, or its subsidiaries.
---

# Play that sets up telegraf monitoring on the hostservers, tgens and DPUs groups with privilege escalation enabled.
- name: Monitoring
# Management server runs it via compose, see below. So skip it here
hosts: hostservers,tgens,DPUs
become: yes
become: true
# bmc_vars resolves to the hostvars of the inventory entry named "<inventory_hostname>bmc";
# the REDFISH_* settings passed to telegraf are read from it.
vars:
bmc_vars: "{{ hostvars[inventory_hostname+'bmc'] }}"
bmc_vars: "{{ hostvars[inventory_hostname+'bmc'] }}"
tasks:

# Copies the local ../telegraf.d directory under /root on the target host;
# other tasks in this play refer to the result as /root/telegraf.d.
- name: Copy telegraf folder to remote
ansible.builtin.copy: src=../telegraf.d dest=/root
- name: Copy telegraf folder to remote folder
ansible.builtin.copy:
src: ../telegraf.d
dest: /root
mode: "0755"

# Deletes arista.conf from the copied /root/telegraf.d folder (state: absent).
# NOTE(review): presumably because that config does not apply to these hosts - confirm.
- name: Remove arista config file
ansible.builtin.file: state=absent path=/root/telegraf.d/arista.conf

# TODO: create new telegraf container or use same for Marvell card
ansible.builtin.file:
path: /root/telegraf.d/arista.conf
state: absent

# TODO: look for opportunities to consolidate these tasks and remove duplicated code

# Tasks in this block run only on the host whose inventory name is 'bf2'.
- name: Nvidia | telegraf otel monitoring
- name: Nvidia | Run additional Nvidia specific tasks
when: inventory_hostname == 'bf2'
block:
# Ensures the set_emu_param unit is started; /run/emu_param is bind-mounted into the telegraf container on bf2.
- name: Nvidia | make sure emulation is running for temperature
ansible.builtin.systemd: state=started name=set_emu_param
- ansible.builtin.systemd: state=stopped name=mlnx_snap
- ansible.builtin.systemd: state=started name=spdk_tgt
- name: Nvidia | Run telegraf container on Nvidia BF
community.docker.docker_container:
name: telegraf
image: docker.io/library/telegraf:1.31
ansible.builtin.systemd:
name: set_emu_param
state: started
restart: true
detach: true
network_mode: host
restart_policy: always
mounts:
- type: bind
source: /root/telegraf.d/telegraf.conf.bf2
target: /etc/telegraf/telegraf.conf
read_only: true
- type: bind
source: /run/emu_param
target: /run/emu_param
read_only: true

# TODO: look for opportunities to consolidate these tasks and remove duplicated code
# Stop the mlnx_snap unit, then make sure spdk_tgt is started.
# NOTE(review): presumably the two services must not run together - confirm.
# Task names are aligned with the systemd units they actually manage.
- name: Nvidia | Stop mlnx_snap service
  ansible.builtin.systemd:
    name: mlnx_snap
    state: stopped

- name: Nvidia | Start spdk_tgt service
  ansible.builtin.systemd:
    name: spdk_tgt
    state: started

# Tasks in this block run only on the host whose inventory name is 'mev'.
# proxy_env is applied as the environment when it is defined; otherwise an empty mapping is used.
- name: Intel | telegraf otel monitoring
- name: Intel | Set proxy environment and downgrade requests package due to bug
when: inventory_hostname == 'mev'
environment: "{{ proxy_env | default({}) }}"
block:
# Keeps the Python requests package below 2.32 (see the issue linked in the task name).
- name: Intel | Downgrade requests package due to bug https://github.com/ansible-collections/community.docker/issues/868
ansible.builtin.pip: name=requests<2.32
- name: Intel | Run telegraf container on Intel MEV
community.docker.docker_container:
name: telegraf
image: docker.io/library/telegraf:1.31
state: started
restart: true
detach: true
network_mode: host
restart_policy: always
mounts:
- type: bind
source: /root/telegraf.d/telegraf.conf.mev
target: /etc/telegraf/telegraf.conf
read_only: true
ansible.builtin.pip:
name: requests
version: "<2.32"

- name: Run telegraf container on others
when:
- inventory_hostname != 'mev'
- inventory_hostname != 'bf2'
# Builds the telegraf_env fact holding the Redfish connection settings taken
# from bmc_vars. Skipped on mev and bf2, which are excluded by the condition.
# no_log keeps the BMC password out of the task output and logs.
- name: Define telegraf environment variables (only if not mev or bf2)
  when: inventory_hostname not in ['mev', 'bf2']
  no_log: true
  ansible.builtin.set_fact:
    telegraf_env:
      REDFISH_HOST: "{{ bmc_vars.ansible_host }}"
      REDFISH_USER: "{{ bmc_vars.ansible_user }}"
      REDFISH_PASSWORD: "{{ bmc_vars.ansible_password }}"
      REDFISH_SYSTEM_ID: "{{ bmc_vars.resource_id }}"

# Single telegraf container task with no host condition of its own; mounts and env are chosen per host by the
# templated expressions:
#   - bf2 and mev bind /root/telegraf.d/telegraf.conf.<inventory_hostname> onto /etc/telegraf/telegraf.conf;
#     every other host binds the whole /root/telegraf.d directory onto /etc/telegraf/telegraf.d.
#   - bf2 additionally binds /run/emu_param onto the same path inside the container.
#   - All binds are read-only.
#   - Hosts other than mev and bf2 receive telegraf_env (the REDFISH_* values); mev and bf2 receive an empty mapping.
- name: Run telegraf container on all hosts
community.docker.docker_container:
name: telegraf
image: docker.io/library/telegraf:1.31
Expand All @@ -81,13 +63,35 @@
detach: true
network_mode: host
restart_policy: always
mounts:
- type: bind
source: /root/telegraf.d
target: /etc/telegraf/telegraf.d
read_only: true
env:
REDFISH_HOST: "{{ bmc_vars.ansible_host }}"
REDFISH_USER: "{{ bmc_vars.ansible_user }}"
REDFISH_PASSWORD: "{{ bmc_vars.ansible_password }}"
REDFISH_SYSTEM_ID: "{{ bmc_vars.resource_id }}"
mounts: >
{{
[
{
'type': 'bind',
'source': (
'/root/telegraf.d/telegraf.conf.' + inventory_hostname
if inventory_hostname in ['bf2', 'mev']
else '/root/telegraf.d'
),
'target': (
'/etc/telegraf/telegraf.conf'
if inventory_hostname in ['bf2', 'mev']
else '/etc/telegraf/telegraf.d'
),
'read_only': True
}
] + (
[
{
'type': 'bind',
'source': '/run/emu_param',
'target': '/run/emu_param',
'read_only': True
}
] if inventory_hostname == 'bf2' else []
)
}}
env: >
{{
telegraf_env if inventory_hostname not in ['mev', 'bf2'] else {}
}}

0 comments on commit 18d3144

Please sign in to comment.