Skip to content

Commit

Permalink
improved reporting step timeout and duration
Browse files Browse the repository at this point in the history
  • Loading branch information
godfryd committed Apr 10, 2021
1 parent 444c03a commit 136e253
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 22 deletions.
37 changes: 17 additions & 20 deletions agent/kraken/agent/jobber.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020 The Kraken Authors
# Copyright 2020-2021 The Kraken Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -119,6 +119,8 @@ def __init__(self, kk_srv, command, job_id, idx):

self.subprocess_task = None

self.start_time = time.time()

def cancel(self):
self.is_canceled = True
if self.subprocess_task:
Expand All @@ -144,6 +146,7 @@ async def async_handle_request(self, reader, writer):
log.exception('problem with decoding data %s from %s', data, addr)
return

data['duration'] = round(time.time() - self.proc_coord.start_time + 0.5)
self.proc_coord.result = data

if self.proc_coord.command in ['run_tests', 'run_analysis', 'run_artifacts']:
Expand Down Expand Up @@ -257,9 +260,8 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline):
result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'collect_tests', job_dir, 10, user, step_file_path, job_id, idx)
log.info('result for collect_tests: %s', str(result)[:200])
if cancel:
result = {'status': 'cancel', 'reason': 'cancel'}
log.info('canceling job')
return result, cancel
return 'cancel', cancel

# check result
if not isinstance(result, dict):
Expand All @@ -268,7 +270,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline):
# if command not succeeded
if result['status'] != 'done':
rsp = srv.report_step_result(job_id, idx, result)
return result, rsp.get('cancel', False)
return result['status'], rsp.get('cancel', False)

# check result
if 'tests' not in result:
Expand All @@ -278,7 +280,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline):
if len(result['tests']) == 0:
result = {'status': 'error', 'reason': 'no-tests'}
rsp = srv.report_step_result(job_id, idx, result)
return result, rsp.get('cancel', False)
return result['status'], rsp.get('cancel', False)

# if there are tests then send them for dispatching to server
response = srv.dispatch_tests(job_id, idx, result['tests'])
Expand All @@ -291,60 +293,56 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline):
log.info('timout expired %s', deadline)
result = {'status': 'error', 'reason': 'job-timeout'}
srv.report_step_result(job_id, idx, result)
return result, cancel
return result['status'], cancel
result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_tests', job_dir, timeout, user, step_file_path, job_id, idx)
log.info('result for run_tests: %s', str(result)[:200])
# do not srv.report_step_result, it was already done in RequestHandler.async_handle_request
if cancel:
result = {'status': 'cancel', 'reason': 'cancel'}
log.info('canceling job')
return result, cancel
return 'cancel', cancel

if 'run_analysis' in available_commands:
timeout = deadline - time.time()
if timeout <= 0:
log.info('timout expired %s', deadline)
result = {'status': 'error', 'reason': 'job-timeout'}
srv.report_step_result(job_id, idx, result)
return result, cancel
return result['status'], cancel
result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_analysis', job_dir, timeout, user, step_file_path, job_id, idx)
log.info('result for run_analysis: %s', str(result)[:200])
# do not srv.report_step_result, it was already done in RequestHandler.async_handle_request
if cancel:
result = {'status': 'cancel', 'reason': 'cancel'}
log.info('canceling job')
return result, cancel
return 'cancel', cancel

if 'run_artifacts' in available_commands:
timeout = deadline - time.time()
if timeout <= 0:
log.info('timout expired %s', deadline)
result = {'status': 'error', 'reason': 'job-timeout'}
srv.report_step_result(job_id, idx, result)
return result, cancel
return result['status'], cancel
result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run_artifacts', job_dir, timeout, user, step_file_path, job_id, idx)
log.info('result for run_artifacts: %s', str(result)[:200])
# do not srv.report_step_result, it was already done in RequestHandler.async_handle_request
if cancel:
result = {'status': 'cancel', 'reason': 'cancel'}
log.info('canceling job')
return result, cancel
return 'cancel', cancel

if 'run' in available_commands:
timeout = deadline - time.time()
if timeout <= 0:
log.info('timout expired %s', deadline)
result = {'status': 'error', 'reason': 'job-timeout'}
srv.report_step_result(job_id, idx, result)
return result, cancel
return result['status'], cancel
attempts = step.get('attempts', 1)
sleep_time_after_attempt = step.get('sleep_time_after_attempt', 0)
for n in range(attempts):
result, cancel = _exec_tool(srv, exec_ctx, tool_path, 'run', job_dir, timeout, user, step_file_path, job_id, idx)
if cancel:
result = {'status': 'cancel', 'reason': 'cancel'}
log.info('canceling job')
return result, cancel
return 'cancel', cancel
if result['status'] == 'done':
break
retry_info = 'no more retries' if n + 1 == attempts else ('retrying after %ds' % sleep_time_after_attempt)
Expand All @@ -357,7 +355,7 @@ def _run_step(srv, exec_ctx, job_dir, job_id, idx, step, tools, deadline):
if cancel:
log.info('canceling job')

return result, cancel
return result['status'], cancel


def _create_exec_context(job):
Expand Down Expand Up @@ -403,8 +401,7 @@ def run(srv, job):
step['trigger_data'] = job['trigger_data']

try:
result, cancel = _run_step(srv, exec_ctx, job_dir, job['id'], idx, step, tools, job['deadline'])
last_status = result['status']
last_status, cancel = _run_step(srv, exec_ctx, job_dir, job['id'], idx, step, tools, job['deadline'])
except KeyboardInterrupt:
raise
except:
Expand Down
5 changes: 4 additions & 1 deletion agent/kraken/agent/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import sys
import json
import time
import socket
import inspect
import logging
Expand Down Expand Up @@ -107,6 +108,7 @@ def execute(sock, module, command, step_file_path):

result = {'status': 'done'}
ret = 0
t0 = time.time()

if command == 'get_commands':
func_list = [o[0] for o in inspect.getmembers(
Expand Down Expand Up @@ -141,7 +143,8 @@ def execute(sock, module, command, step_file_path):
else:
raise Exception('unknown command %s' % command)

log.info('step tool %s, cmd %s done with retcode %s', tool_name, command, ret)
# Elapsed wall-clock time of the command: current time minus the start
# stamp taken in t0 before dispatch, so the value is non-negative.
duration = time.time() - t0
log.info('step tool %s, cmd %s done with retcode %s in %dsecs', tool_name, command, ret, duration)

if ret != 0:
if ret == 10000:
Expand Down
11 changes: 10 additions & 1 deletion ui/src/app/run-results/run-results.component.html
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,18 @@
<div>
Status: <i style="vertical-align: baseline;" [ngClass]="getStepStatusClass(step)"></i> {{ getStepStatus(step) }}
</div>
<div *ngIf="step.result && step.result.duration >= 0">
Duration: {{ step.result.duration }}s
</div>
<div *ngIf="step.result && step.result['reason']">
Reason: {{ step.result['reason'] }}
<pre>{{ step.result['msg'] }}</pre>
<div *ngIf="step.result['reason'] == 'step-timeout'">
Timeout: {{ step.timeout || '60' }}s
<br>
</div>
<div *ngIf="step.result['reason'] != 'step-timeout'">
<pre>{{ step.result['msg'] }}</pre>
</div>
</div>
</div>
</p-tabPanel>
Expand Down

0 comments on commit 136e253

Please sign in to comment.