Skip to content

Commit

Permalink
Monitoring / Prometheus fixes: Ceph scraping and simcore container la…
Browse files Browse the repository at this point in the history
…bels (#322)

* Remove ceph scrape (was broken) on osparc-public

* Address DevOps Changes ITISFoundation/osparc-simcore#4453

---------

Co-authored-by: kaiser <[email protected]>
  • Loading branch information
mrnicegyu11 and kaiser authored Aug 22, 2023
1 parent a001605 commit 5f9e20a
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 47 deletions.
8 changes: 6 additions & 2 deletions services/monitoring/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,16 @@ up-letsencrypt-http: .init .env config.monitoring config.prometheus ${TEMP_COMP
@docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-letsencrypt-http ${STACK_NAME}

.PHONY: up-letsencrypt-dns
up-letsencrypt-dns: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-letsencrypt-dns ## Deploys or updates current stack "$(STACK_NAME)" using let's encrypt dns challenge
up-letsencrypt-dns: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-letsencrypt-dns ## Deploys or updates current stack "$(STACK_NAME)" using let's encrypt dns challenge
@docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-letsencrypt-dns ${STACK_NAME}

.PHONY: up-dalco
up-dalco: .init .env config.monitoring config.prometheus.ceph.simcore ${TEMP_COMPOSE}-dalco ## Deploys monitoring stack for Dalco Cluster
@docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-dalco ${STACK_NAME}

.PHONY: up-public
up-public: up-dalco
up-public: .init .env config.monitoring config.prometheus ${TEMP_COMPOSE}-public ## Deploys monitoring stack for Public Cluster
@docker stack deploy --with-registry-auth --prune --compose-file ${TEMP_COMPOSE}-public ${STACK_NAME}

.PHONY: up-aws
up-aws: .init .env config.monitoring config.prometheus.simcore ${TEMP_COMPOSE}-aws ## Deploys or updates current stack "$(STACK_NAME)" in AWS
Expand All @@ -65,6 +66,9 @@ ${TEMP_COMPOSE}-letsencrypt-dns: docker-compose.yml docker-compose.letsencrypt.d
${TEMP_COMPOSE}-dalco: docker-compose.yml docker-compose.dalco.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml
@${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.letsencrypt.dns.yml docker-compose.dalco.yml > $@

${TEMP_COMPOSE}-public: docker-compose.yml docker-compose.public.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml
@${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.letsencrypt.dns.yml docker-compose.public.yml > $@

${TEMP_COMPOSE}-aws: docker-compose.yml docker-compose.aws.yml docker-compose.letsencrypt.dns.yml config.monitoring .env pgsql_query_exporter_config.yaml smokeping_prober_config.yaml
@${REPO_BASE_DIR}/scripts/docker-compose-config.bash -e .env $< docker-compose.aws.yml docker-compose.letsencrypt.dns.yml > $@

Expand Down
26 changes: 26 additions & 0 deletions services/monitoring/docker-compose.public.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Compose override for the "public" cluster variant of the monitoring stack.
# The Makefile rule ${TEMP_COMPOSE}-public merges this file with
# docker-compose.yml and docker-compose.letsencrypt.dns.yml.
# NOTE(review): nesting below was reconstructed from the Compose schema because
# the indentation was lost — confirm against the repository copy.
version: '3.7'
services:
  cadvisor-exporter:
    # Host paths mounted read-only into the exporter container.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
  grafana:
    # Explicit DNS server for the grafana container.
    dns: 8.8.8.8
    deploy:
      placement:
        constraints:
          # Schedule only on nodes carrying the label grafana=true.
          - node.labels.grafana==true

  prometheuscatchall:
    deploy:
      placement:
        constraints:
          # Schedule only on nodes carrying the label prometheus=true.
          - node.labels.prometheus==true
  prometheuscadvisor:
    deploy:
      placement:
        constraints:
          # Same placement rule as prometheuscatchall.
          - node.labels.prometheus==true
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)",
"expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -383,13 +383,13 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)",
"expr": "count(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*[.osparc.io].*/simcore/services/dynamic/s4l-core-lite.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Number of s4l-lite studies running (excluding puppeteer from v1.52.0)",
"title": "Number of s4l-lite studies running",
"type": "timeseries"
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/electrode-selector.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -273,7 +273,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/sim4life-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -366,7 +366,7 @@
"uid": "RmZEr52nz"
},
"editorMode": "code",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_simcore_user_agent!=\"puppeteer\"}),0)",
"expr": "count(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}) OR clamp_max(absent(container_memory_usage_bytes{image=~\"^.*/simcore/services/dynamic/ti-postpro.*$\",container_label_io_simcore_runtime_simcore_user_agent!=\"puppeteer\"}),0)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
59 changes: 20 additions & 39 deletions services/monitoring/prometheus/prometheus-base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,48 +97,10 @@ scrape_configs:
# this would require sophisticated regex parsing
# To use these: # KEEP: container_label_simcore_service_settings
#
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{\"Limits\": (\{".*\})?\{\"NanoCPUs\": )(\d*)([,\}])'
# target_label: docker_service_resources_nanocpu_limit
# replacement: ${4}
# action: replace
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{\"Limits\": \{)(\{".*\})?(\".*\"\: \d+[,\}] )?(\"MemoryBytes\": )(\d*)([\}])'
# target_label: docker_service_resources_memorybytes_limit
# replacement: ${6}
# action: replace
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{)("Limits".*, )?(\"Reservations\": (\{".*\})?\{\"NanoCPUs\": )(\d*)([,\}])'
# target_label: docker_service_resources_nanocpu_reservation
# replacement: ${6}
# action: replace
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^\"(.*)?(\"type\": \"Resources\", \"value\": \{)(.*, )?(\"Reservations\": )(\{".*\})?\{(".*": \d+[,\}] )?(\"MemoryBytes\": )(\d*)([,\}])'
# target_label: docker_service_resources_nanocpu_reservation
# replacement: ${8}
# action: replace
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^"(.*)?("GenericResources": \[\{"DiscreteResourceSpec": )(.*)(\{"Kind": "VRAM", "Value": )(\d+)'
# target_label: docker_service_resources_vram_reservation
# replacement: ${5}
# action: replace
#- source_labels: [container_label_simcore_service_settings]
# separator: ;
# regex: '^"(.*)?("GenericResources": \[\{"DiscreteResourceSpec": )(.*)(\{"Kind": "AIRAM", "Value": )(\d+)'
# target_label: docker_service_resources_vram_reservation
# replacement: ${5}
# action: replace
- regex: "container_label_com_docker_compose_config_hash"
action: labeldrop # cAdvisor pruning
- regex: "container_label_com_docker_compose_container_number"
action: labeldrop # cAdvisor pruning
- regex: "container_label_io_simcore_.*"
action: labeldrop # cAdvisor pruning
- regex: "container_label_simcore_service_compose_spec"
action: labeldrop # cAdvisor pruning
- regex: "container_label_simcore_service_container_http_entrypoint"
Expand All @@ -157,7 +119,26 @@ scrape_configs:
action: labeldrop # cAdvisor pruning
- regex: "container_label_maintainer"
action: labeldrop # cAdvisor pruning

- regex: "container_label_com_docker_compose_project_working_dir"
action: labeldrop
- regex: "container_label_io_simcore_contact"
action: labeldrop
- regex: "container_label_io_simcore_authors"
action: labeldrop
- regex: "container_label_io_simcore_description"
action: labeldrop
- regex: "container_label_io_simcore_inputs"
action: labeldrop
- regex: "container_label_io_simcore_key"
action: labeldrop
- regex: "container_label_io_simcore_outputs"
action: labeldrop
- regex: "container_label_io_simcore_runtime_swarm_stack_name"
action: labeldrop
- regex: "container_label_io_simcore_thumbnail"
action: labeldrop
- regex: "container_label_simcore_service_settings"
action: labeldrop

### Used:
# container_label_com_docker_swarm_node_id
Expand Down

0 comments on commit 5f9e20a

Please sign in to comment.