Skip to content

Commit

Permalink
Merge branch 'master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
vkarak committed Jan 23, 2025
2 parents 520333d + e66c20c commit 09d2197
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 20 deletions.
26 changes: 26 additions & 0 deletions docs/config_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1608,6 +1608,32 @@ This handler transmits the whole log record, meaning that all the information wi
.. versionadded:: 4.1


.. py:attribute:: logging.handlers_perflog..httpjson..backoff_intervals
:required: No
:default: ``[0.1, 0.2, 0.4, 0.8, 1.6, 3.2]``

List of wait intervals in seconds to use when the server responds with HTTP error 429 (``TOO_MANY_REQUESTS``).

In this case, ReFrame will retry contacting the server after waiting an amount of time that is determined by cyclically iterating this list of intervals.

ReFrame will keep trying to contact the server until a different HTTP response is received (either success or error) or the corresponding :attr:`~config.logging.handlers_perflog..httpjson..retry_timeout` is exceeded.

.. versionadded:: 4.7.3


.. py:attribute:: logging.handlers_perflog..httpjson..retry_timeout
:required: No
:default: ``0``

Timeout in seconds for retrying when the server responds with HTTP error 429 (``TOO_MANY_REQUESTS``).

If set to zero, ReFrame will retry until another HTTP response (success or error) is received.


.. versionadded:: 4.7.3


.. _exec-mode-config:

Expand Down
4 changes: 4 additions & 0 deletions reframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import os
import sys

# Package version string. This branch carries the development pre-release
# version rather than the 4.7.3 release version coming from `master`.
VERSION = '4.8.0-dev.2'

# Normalized absolute path of the directory one level above this package
INSTALL_PREFIX = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
)
Expand Down
34 changes: 27 additions & 7 deletions reframe/core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# SPDX-License-Identifier: BSD-3-Clause

import abc
import itertools
import logging
import logging.handlers
import numbers
Expand Down Expand Up @@ -554,6 +555,8 @@ def _create_httpjson_handler(site_config, config_prefix):
json_formatter = site_config.get(f'{config_prefix}/json_formatter')
extra_headers = site_config.get(f'{config_prefix}/extra_headers')
debug = site_config.get(f'{config_prefix}/debug')
backoff_intervals = site_config.get(f'{config_prefix}/backoff_intervals')
retry_timeout = site_config.get(f'{config_prefix}/retry_timeout')

parsed_url = urllib.parse.urlparse(url)
if parsed_url.scheme not in {'http', 'https'}:
Expand Down Expand Up @@ -595,7 +598,8 @@ def _create_httpjson_handler(site_config, config_prefix):
'no data will be sent to the server')

return HTTPJSONHandler(url, extras, ignore_keys, json_formatter,
extra_headers, debug)
extra_headers, debug, backoff_intervals,
retry_timeout)


def _record_to_json(record, extras, ignore_keys):
Expand Down Expand Up @@ -645,7 +649,7 @@ class HTTPJSONHandler(logging.Handler):

def __init__(self, url, extras=None, ignore_keys=None,
json_formatter=None, extra_headers=None,
debug=False):
debug=False, backoff_intervals=(1, 2, 3), retry_timeout=0):
super().__init__()
self._url = url
self._extras = extras
Expand All @@ -669,6 +673,8 @@ def __init__(self, url, extras=None, ignore_keys=None,
self._headers.update(extra_headers)

self._debug = debug
self._timeout = retry_timeout
self._backoff_intervals = backoff_intervals

def emit(self, record):
# Convert tags to a list to make them JSON friendly
Expand All @@ -680,19 +686,33 @@ def emit(self, record):
return

if self._debug:
import time
ts = int(time.time() * 1_000)
dump_file = f'httpjson_record_{ts}.json'
with open(dump_file, 'w') as fp:
fp.write(json_record)

return

timeout_time = time.time() + self._timeout
try:
requests.post(
self._url, data=json_record,
headers=self._headers
)
backoff_intervals = itertools.cycle(self._backoff_intervals)
while True:
response = requests.post(
self._url, data=json_record,
headers=self._headers
)
if response.ok:
break

if (response.status_code == 429 and
(not self._timeout or time.time() < timeout_time)):
time.sleep(next(backoff_intervals))
continue

raise LoggingError(
f'HTTPJSONhandler logging failed: HTTP response code '
f'{response.status_code}'
)
except requests.exceptions.RequestException as e:
raise LoggingError('logging failed') from e

Expand Down
20 changes: 16 additions & 4 deletions reframe/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2310,8 +2310,9 @@ def check_performance(self):
return

# Check the performance variables against their references.
errors = []
for key, values in self._perfvalues.items():
val, ref, low_thres, high_thres, *_ = values
val, ref, low_thres, high_thres, unit = values

# Verify that val is a number
if not isinstance(val, numbers.Number):
Expand All @@ -2325,11 +2326,22 @@ def check_performance(self):
sn.evaluate(
sn.assert_reference(
val, ref, low_thres, high_thres,
msg=('failed to meet reference: %s={0}, '
'expected {1} (l={2}, u={3})' % tag))
msg=(f'{tag}={{0}} {unit}, expected {{1}} '
'(l={2}, u={3})'))
)
except SanityError as e:
raise PerformanceError(e) from None
errors.append(e.message)

# Combine all error messages to a single `PerformanceError` containing
# the information of all failed performance variables
if errors:
msg = 'failed to meet references:'
if len(errors) > 1:
msg += '\n\t'
else:
msg += ' '

raise PerformanceError(msg + '\n\t'.join(errors))

def _copy_job_files(self, job, dst):
if job is None:
Expand Down
3 changes: 2 additions & 1 deletion reframe/core/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,8 @@ def is_avail(self):
'ALLOCATED',
'COMPLETING',
'IDLE',
'RESERVED',
'PLANNED',
'RESERVED'
}
return self._states <= available_states

Expand Down
21 changes: 16 additions & 5 deletions reframe/frontend/executors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
import reframe.utility.jsonext as jsonext
import reframe.utility.typecheck as typ
from reframe.core.exceptions import (AbortTaskError,
JobNotStartedError,
FailureLimitError,
ForceExitError,
JobNotStartedError,
LoggingError,
RunSessionTimeout,
SkipTestError,
StatisticsError,
Expand Down Expand Up @@ -480,8 +481,13 @@ def finalize(self):

self._current_stage = 'finalize'
self._notify_listeners('on_task_success')
self._perflogger.log_performance(logging.INFO, self,
multiline=self._perflog_compat)
try:
self._perflogger.log_performance(logging.INFO, self,
multiline=self._perflog_compat)
except LoggingError as e:
logging.getlogger().warning(
f'could not log performance data for {self.testcase}: {e}'
)

@logging.time_function
def cleanup(self, *args, **kwargs):
Expand All @@ -491,8 +497,13 @@ def fail(self, exc_info=None, callback='on_task_failure'):
self._failed_stage = self._current_stage
self._exc_info = exc_info or sys.exc_info()
self._notify_listeners(callback)
self._perflogger.log_performance(logging.INFO, self,
multiline=self._perflog_compat)
try:
self._perflogger.log_performance(logging.INFO, self,
multiline=self._perflog_compat)
except LoggingError as e:
logging.getlogger().warning(
f'could not log performance data for {self.testcase}: {e}'
)

def skip(self, exc_info=None):
self._skipped = True
Expand Down
2 changes: 1 addition & 1 deletion reframe/frontend/executors/policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def __init__(self):
'_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs')
}
self._pipeline_statistics = rt.runtime().get_option(
'systems/0/dump_pipeline_progress'
'general/0/dump_pipeline_progress'
)
self.task_listeners.append(self)

Expand Down
3 changes: 2 additions & 1 deletion reframe/frontend/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def _print_failure_info(rec, runid, total_runs):

self.info(f" * Reason: {msg}")
tb = ''.join(traceback.format_exception(
*rec['fail_info'].values()))
*rec['fail_info'].values())
)
if rec['fail_severe']:
self.info(tb)
else:
Expand Down
10 changes: 9 additions & 1 deletion reframe/schemas/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,12 @@
},
"json_formatter": {},
"extra_headers": {"type": "object"},
"debug": {"type": "boolean"}
"debug": {"type": "boolean"},
"backoff_intervals": {
"type": "array",
"items": {"type": "number"}
},
"retry_timeout": {"type": "number"}
},
"required": ["url"]
}
Expand Down Expand Up @@ -531,6 +536,7 @@
"target_systems": {"$ref": "#/defs/system_ref"},
"table_format": {"enum": ["csv", "plain", "pretty"]},
"timestamp_dirs": {"type": "string"},
"topology_prefix": {"type": "string"},
"trap_job_errors": {"type": "boolean"},
"unload_modules": {"$ref": "#/defs/modules_list"},
"use_login_shell": {"type": "boolean"},
Expand Down Expand Up @@ -632,6 +638,8 @@
"logging/handlers_perflog/httpjson_json_formatter": null,
"logging/handlers_perflog/httpjson_extra_headers": {},
"logging/handlers_perflog/httpjson_debug": false,
"logging/handlers_perflog/httpjson_backoff_intervals": [0.1, 0.2, 0.4, 0.8, 1.6, 3.2],
"logging/handlers_perflog/httpjson_retry_timeout": 0,
"modes/options": [],
"modes/target_systems": ["*"],
"storage/enable": false,
Expand Down

0 comments on commit 09d2197

Please sign in to comment.