From 136e2532f8c034e58d8842a66e547a669ae2d6ee Mon Sep 17 00:00:00 2001 From: Michal Nowikowski Date: Sat, 10 Apr 2021 19:59:05 +0200 Subject: [PATCH] improved reporting step timeout and duration --- agent/kraken/agent/jobber.py | 37 +++++++++---------- agent/kraken/agent/tool.py | 5 ++- .../run-results/run-results.component.html | 11 +++++- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/agent/kraken/agent/jobber.py b/agent/kraken/agent/jobber.py index f82c3c22..e91439b4 100644 --- a/agent/kraken/agent/jobber.py +++ b/agent/kraken/agent/jobber.py @@ -1,4 +1,4 @@ -# Copyright 2020 The Kraken Authors +# Copyright 2020-2021 The Kraken Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -119,6 +119,8 @@ def __init__(self, kk_srv, command, job_id, idx): self.subprocess_task = None + self.start_time = time.time() + def cancel(self): self.is_canceled = True if self.subprocess_task: @@ -144,6 +146,7 @@ async def async_handle_request(self, reader, writer): log.exception('problem with decoding data %s from %s', data, addr) return + data['duration'] = round(time.time() - self.proc_coord.start_time + 0.5) self.proc_coord.result = data if self.proc_coord.command in ['run_tests', 'run_analysis', 'run_artifacts']: @@ -257,9 +260,8 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'collect_tests', job_dir, 10, user, step_file_path, job_id, idx) log.info('result for collect_tests: %s', str(result)[:200]) if cancel: - result = {'status': 'cancel', 'reason': 'cancel'} log.info('canceling job') - return result, cancel + return 'cancel', cancel # check result if not isinstance(result, dict): @@ -268,7 +270,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): # if command not succeeded if result['status'] != 'done': rsp = srv.report_step_result(job_id, idx, result) - return result, rsp.get('cancel', False) + return result['status'], rsp.get('cancel', False) # check result if 'tests' not in result: @@ -278,7 +280,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): if len(result['tests']) == 0: result = {'status': 'error', 'reason': 'no-tests'} rsp = srv.report_step_result(job_id, idx, result) - return result, rsp.get('cancel', False) + return result['status'], rsp.get('cancel', False) # if there are tests then send them for dispatching to server response = srv.dispatch_tests(job_id, idx, result['tests']) @@ -291,14 +293,13 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): log.info('timout expired %s', deadline) result = {'status': 'error', 'reason': 'job-timeout'} srv.report_step_result(job_id, idx, result) - return result, cancel + return result['status'], cancel result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_tests', job_dir, timeout, user, step_file_path, job_id, idx) log.info('result for run_tests: %s', str(result)[:200]) # do not srv.report_step_result, it was already done in RequestHandler.async_handle_request if cancel: - result = {'status': 'cancel', 'reason': 'cancel'} log.info('canceling job') - return result, cancel + return 'cancel', cancel if 'run_analysis' in available_commands: timeout = deadline - time.time() @@ -306,14 +307,13 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): log.info('timout expired %s', deadline) result = {'status': 'error', 'reason': 'job-timeout'} srv.report_step_result(job_id, idx, result) - return result, cancel + return result['status'], cancel result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_analysis', job_dir, timeout, user, step_file_path, job_id, idx) log.info('result for run_analysis: %s', str(result)[:200]) # do not srv.report_step_result, it was already done in RequestHandler.async_handle_request if cancel: - result = {'status': 'cancel', 'reason': 'cancel'} log.info('canceling job') - return result, cancel + return 'cancel', cancel if 'run_artifacts' in available_commands: timeout = deadline - time.time() @@ -321,14 +321,13 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): log.info('timout expired %s', deadline) result = {'status': 'error', 'reason': 'job-timeout'} srv.report_step_result(job_id, idx, result) - return result, cancel + return result['status'], cancel result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_artifacts', job_dir, timeout, user, step_file_path, job_id, idx) log.info('result for run_artifacts: %s', str(result)[:200]) # do not srv.report_step_result, it was already done in RequestHandler.async_handle_request if cancel: - result = {'status': 'cancel', 'reason': 'cancel'} log.info('canceling job') - return result, cancel + return 'cancel', cancel if 'run' in available_commands: timeout = deadline - time.time() @@ -336,15 +335,14 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): log.info('timout expired %s', deadline) result = {'status': 'error', 'reason': 'job-timeout'} srv.report_step_result(job_id, idx, result) - return result, cancel + return result['status'], cancel attempts = step.get('attempts', 1) sleep_time_after_attempt = step.get('sleep_time_after_attempt', 0) for n in range(attempts): result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run', job_dir, timeout, user, step_file_path, job_id, idx) if cancel: - result = {'status': 'cancel', 'reason': 'cancel'} log.info('canceling job') - return result, cancel + return 'cancel', cancel if result['status'] == 'done': break retry_info = 'no more retries' if n + 1 == attempts else ('retrying after %ds' % sleep_time_after_attempt) @@ -357,7 +355,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline): if cancel: log.info('canceling job') - return result, cancel + return result['status'], cancel def _create_exec_context(job): @@ -403,8 +401,7 @@ def run(srv, job): step['trigger_data'] = job['trigger_data'] try: - result, cancel = _run_step(srv, exec_ctx, job_dir, job['id'], idx, step, tools, job['deadline']) - last_status = result['status'] + last_status, cancel = _run_step(srv, exec_ctx, job_dir, job['id'], idx, step, tools, job['deadline']) except KeyboardInterrupt: raise except: diff --git a/agent/kraken/agent/tool.py b/agent/kraken/agent/tool.py index b53dc01c..0b26c011 100644 --- a/agent/kraken/agent/tool.py +++ b/agent/kraken/agent/tool.py @@ -14,6 +14,7 @@ import sys import json +import time import socket import inspect import logging @@ -107,6 +108,7 @@ def execute(sock, module, command, step_file_path): result = {'status': 'done'} ret = 0 + t0 = time.time() if command == 'get_commands': func_list = [o[0] for o in inspect.getmembers( @@ -141,7 +143,8 @@ def execute(sock, module, command, step_file_path): else: raise Exception('unknown command %s' % command) - log.info('step tool %s, cmd %s done with retcode %s', tool_name, command, ret) + duration = t0 - time.time() + log.info('step tool %s, cmd %s done with retcode %s in %dsecs', tool_name, command, ret, duration) if ret != 0: if ret == 10000: diff --git a/ui/src/app/run-results/run-results.component.html b/ui/src/app/run-results/run-results.component.html index ee1d4e1b..cffb8a4a 100644 --- a/ui/src/app/run-results/run-results.component.html +++ b/ui/src/app/run-results/run-results.component.html @@ -101,9 +101,18 @@
Status: {{ getStepStatus(step) }}
+
+ Duration: {{ step.result.duration }}s +
Reason: {{ step.result['reason'] }} -
{{ step.result['msg'] }}
+
+ Timeout: {{ step.timeout || '60' }}s +
+
+
+
{{ step.result['msg'] }}
+