From 88094a9853c2d6a5046a9f67383243569276e912 Mon Sep 17 00:00:00 2001 From: Di Xu Date: Fri, 31 Aug 2018 15:13:10 +0800 Subject: [PATCH] replay on docker to rotate logs for watchdog/job_exporter --- .../prometheus/node-exporter-ds.yaml.template | 8 +- .../prometheus/watchdog-ds.yaml.template | 8 ++ pai-management/src/gpu-exporter/dockerfile | 22 ++--- prometheus/exporter/healthy_check.py | 83 ------------------- prometheus/exporter/job_exporter.py | 13 +-- prometheus/exporter/no_older_than.py | 42 ++++++++++ prometheus/exporter/watchdog.py | 11 +-- 7 files changed, 67 insertions(+), 120 deletions(-) delete mode 100644 prometheus/exporter/healthy_check.py create mode 100644 prometheus/exporter/no_older_than.py diff --git a/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template b/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template index fdd6cb6038..e3a3a63bfb 100644 --- a/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template +++ b/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template @@ -89,8 +89,12 @@ spec: readinessProbe: exec: command: - - python - - /usr/local/healthy_check.py + - "python" + - "/job_exporter/no_older_than.py" + - "--delta" + - "60" + - "/datastorage/prometheus/job_exporter.prom" + - "/datastorage/prometheus/gpu_exporter.prom" initialDelaySeconds: 30 periodSeconds: 30 resources: diff --git a/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template b/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template index 450da2ecf0..9d1158c322 100644 --- a/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template +++ b/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template @@ -40,6 +40,12 @@ spec: - name: watchdog image: {{ clusterinfo["dockerregistryinfo"]["prefix"] }}watchdog:{{ clusterinfo["dockerregistryinfo"]["docker_tag"] }} imagePullPolicy: Always + readinessProbe: + httpGet: + path: / + port: 9101 + initialDelaySeconds: 30 + periodSeconds: 30 resources: limits: memory: "1Gi" @@ -53,6 +59,8 @@ spec: - "/watchdog.py" - "--interval" - "30" + - "--port" + - "9101" - "{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}" volumes: - name: collector-mount diff --git a/pai-management/src/gpu-exporter/dockerfile b/pai-management/src/gpu-exporter/dockerfile index c7f3b3016d..22b5944f36 100644 --- a/pai-management/src/gpu-exporter/dockerfile +++ b/pai-management/src/gpu-exporter/dockerfile @@ -15,31 +15,19 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -FROM ubuntu:16.04 +FROM python:2.7 -# -# Preparation -# ENV NVIDIA_VERSION=current ENV NV_DRIVER=/var/drivers/nvidia/$NVIDIA_VERSION ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NV_DRIVER/lib:$NV_DRIVER/lib64 ENV PATH=$PATH:$NV_DRIVER/bin -WORKDIR /root/ - -RUN apt-get update && \ - apt-get -y install wget build-essential python python-pip git pciutils - -COPY copied_file/exporter/* /usr/local/ +RUN mkdir -p /job_exporter +COPY copied_file/exporter/* /job_exporter/ RUN wget https://download.docker.com/linux/static/stable/x86_64/docker-17.06.2-ce.tgz -RUN cp docker-17.06.2-ce.tgz /usr/local -RUN tar xzvf /usr/local/docker-17.06.2-ce.tgz -C /usr/local/ +RUN tar xzvf docker-17.06.2-ce.tgz -C /usr/local/ RUN cp -r /usr/local/docker/* /usr/bin/ -# -# start -# - -CMD python /usr/local/job_exporter.py /datastorage/prometheus 30 +CMD python /job_exporter/job_exporter.py /datastorage/prometheus 30 diff --git a/prometheus/exporter/healthy_check.py b/prometheus/exporter/healthy_check.py deleted file mode 100644 index eec82b1923..0000000000 --- a/prometheus/exporter/healthy_check.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/python -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -import subprocess -import sys -import logging -from logging.handlers import RotatingFileHandler -import os -import re - -import utils - -logger = logging.getLogger(__name__) - -def main(): - runTimeException = [] - gpuExists = False - - try: - gpuOutput = utils.check_output(["lspci"]) - r = re.search("[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VGA compatible) controller: NVIDIA Corporation.*", gpuOutput, flags=0) - if r is not None: - gpuExists = True - except subprocess.CalledProcessError as e: - runTimeException.append("lspci") - logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output) - - if gpuExists: - try: - smiOutput = utils.check_output(["nvidia-smi", "-q", "-x"]) - except subprocess.CalledProcessError as e: - runTimeException.append("nvidia-smi") - logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output) - except OSError as e: - if e.errno == os.errno.ENOENT: - logger.warning("nvidia-smi not found") - - try: - dockerDockerInspect = utils.check_output(["docker", "inspect", "--help"]) - except subprocess.CalledProcessError as e: - runTimeException.append("docker_inspect") - logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output) - - try: - dockerDockerStats = subprocess.check_output(["docker", "stats", "--no-stream", "--format", - "table {{.Container}}, {{.CPUPerc}},{{.MemUsage}},{{.NetIO}},{{.BlockIO}},{{.MemPerc}}"]) - except subprocess.CalledProcessError as e: - runTimeException.append("docker_stats") - logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output) - - if not os.path.exists("/datastorage/prometheus/job_exporter.prom"): - runTimeException.append(joblogfile) - logger.error("/datastorage/prometheus/job_exporter.prom does not exists") - - if len(runTimeException) > 0: - exception = "| ".join(runTimeException) - raise RuntimeError("gpu-exporter readiness probe failed, error component:" + exception) - -if __name__ == "__main__": - rootLogger = logging.getLogger() - rootLogger.setLevel(logging.INFO) - fh = RotatingFileHandler("/datastorage/prometheus/node_exporter_probe.log", maxBytes= 1024 * 1024 * 10, backupCount=5) - fh.setLevel(logging.INFO) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s") - fh.setFormatter(formatter) - rootLogger.addHandler(fh) - - main() diff --git a/prometheus/exporter/job_exporter.py b/prometheus/exporter/job_exporter.py index c3ce3d5e76..35c4927896 100644 --- a/prometheus/exporter/job_exporter.py +++ b/prometheus/exporter/job_exporter.py @@ -20,7 +20,6 @@ import sys import time import logging -from logging.handlers import RotatingFileHandler import docker_stats import docker_inspect @@ -43,7 +42,6 @@ def parse_from_labels(labels): else: otherLabels[key] = val - return gpuIds, otherLabels @@ -88,14 +86,6 @@ def main(argv): jobMetricsPath = logDir + "/job_exporter.prom" timeSleep = int(argv[1]) - rootLogger = logging.getLogger() - rootLogger.setLevel(logging.INFO) - fh = RotatingFileHandler(logDir + "/gpu_exporter.log", maxBytes= 1024 * 1024 * 10, backupCount=5) - fh.setLevel(logging.INFO) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s") - fh.setFormatter(formatter) - rootLogger.addHandler(fh) - iter = 0 singleton = utils.Singleton(gpu_exporter.collect_gpu_info) @@ -121,4 +111,7 @@ def main(argv): if __name__ == "__main__": + logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", + level=logging.INFO) + main(sys.argv[1:]) diff --git a/prometheus/exporter/no_older_than.py b/prometheus/exporter/no_older_than.py new file mode 100644 index 0000000000..03ccbda354 --- /dev/null +++ b/prometheus/exporter/no_older_than.py @@ -0,0 +1,42 @@ +#!/usr/bin/python +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import argparse +import datetime +import os + +def check_no_older_than(paths, delta): + """ raise RuntimeError exception if any path in paths is older than `now - delta` """ + now = datetime.datetime.now() + delta = datetime.timedelta(seconds=delta) + oldest = now - delta + + for path in paths: + mtime = os.path.getmtime(path) + mtime = datetime.datetime.fromtimestamp(mtime) + if oldest > mtime: + raise RuntimeError("{} was updated more than {} seconds ago".format(path, delta)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("paths", nargs="+", help="file to be checked") + parser.add_argument("-d", "--delta", type=int, default=60, help="check file is no older than -d seconds") + args = parser.parse_args() + + check_no_older_than(args.paths, args.delta) diff --git a/prometheus/exporter/watchdog.py b/prometheus/exporter/watchdog.py index 0ca3f8415a..8ca8424746 100644 --- a/prometheus/exporter/watchdog.py +++ b/prometheus/exporter/watchdog.py @@ -361,14 +361,6 @@ def main(args): hosts = load_machine_list(args.hosts) - rootLogger = logging.getLogger() - rootLogger.setLevel(logging.INFO) - fh = RotatingFileHandler(logDir + "/watchdog.log", maxBytes= 1024 * 1024 * 100, backupCount=5) - fh.setLevel(logging.INFO) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s") - fh.setFormatter(formatter) - rootLogger.addHandler(fh) - list_pods_url = "{}/api/v1/namespaces/default/pods/".format(address) list_nodes_url = "{}/api/v1/nodes/".format(address) @@ -423,4 +415,7 @@ def main(args): parser.add_argument("--hosts", "-m", help="yaml file path contains host info", default="/etc/watchdog/config.yml") args = parser.parse_args() + logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s", + level=logging.INFO) + main(args)