Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Cleanups and sanity-checking in cpu and db metrics #5499

Merged
merged 7 commits into from
Jun 24, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 13 additions & 18 deletions synapse/storage/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,22 +167,22 @@ def __init__(self):
self.current_counters = {}
self.previous_counters = {}

def update(self, key, start_time, end_time=None):
if end_time is None:
end_time = time.time()
duration = end_time - start_time
def update(self, key, duration_secs):
count, cum_time = self.current_counters.get(key, (0, 0))
count += 1
cum_time += duration
cum_time += duration_secs
self.current_counters[key] = (count, cum_time)
return end_time

def interval(self, interval_duration, limit=3):
def interval(self, interval_duration_secs, limit=3):
counters = []
for name, (count, cum_time) in iteritems(self.current_counters):
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
counters.append(
((cum_time - prev_time) / interval_duration, count - prev_count, name)
(
(cum_time - prev_time) / interval_duration_secs,
count - prev_count,
name,
)
)

self.previous_counters = dict(self.current_counters)
Expand Down Expand Up @@ -213,7 +213,6 @@ def __init__(self, db_conn, hs):
# is running in mainline, and we have some nice monitoring frontends
# to watch it
self._txn_perf_counters = PerformanceCounters()
self._get_event_counters = PerformanceCounters()

self._get_event_cache = Cache(
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
Expand Down Expand Up @@ -363,21 +362,17 @@ def loop():
time_then = self._previous_loop_ts
self._previous_loop_ts = time_now

ratio = (curr - prev) / (time_now - time_then)
duration = time_now - time_then
ratio = (curr - prev) / duration

top_three_counters = self._txn_perf_counters.interval(
time_now - time_then, limit=3
)

top_3_event_counters = self._get_event_counters.interval(
time_now - time_then, limit=3
duration, limit=3
)

perf_logger.info(
"Total database time: %.3f%% {%s} {%s}",
"Total database time: %.3f%% {%s}",
ratio * 100,
top_three_counters,
top_3_event_counters,
)

self._clock.looping_call(loop, 10000)
Expand Down Expand Up @@ -459,7 +454,7 @@ def _new_transaction(
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)

self._current_txn_total_time += duration
self._txn_perf_counters.update(desc, start, end)
self._txn_perf_counters.update(desc, duration)
sql_txn_timer.labels(desc).observe(duration)

@defer.inlineCallbacks
Expand Down
40 changes: 33 additions & 7 deletions synapse/util/logcontext.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,9 @@ def stop(self):
)
return

usage_end = get_thread_resource_usage()

self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
self._resource_usage.ru_utime += utime_delta
self._resource_usage.ru_stime += stime_delta

self.usage_start = None

Expand All @@ -346,13 +345,38 @@ def get_resource_usage(self):
# can include resource usage so far.
is_main_thread = threading.current_thread() is self.main_thread
if self.alive and self.usage_start and is_main_thread:
current = get_thread_resource_usage()
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
utime_delta, stime_delta = self._get_cputime()
res.ru_utime += utime_delta
res.ru_stime += stime_delta

return res

def _get_cputime(self):
"""Get the cpu usage time so far

Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
"""
current = get_thread_resource_usage()

utime_delta = current.ru_utime - self.usage_start.ru_utime
stime_delta = current.ru_stime - self.usage_start.ru_stime

# sanity check
if utime_delta < 0:
raise ValueError("utime went backwards! %f < %f" % (
current.ru_utime, self.usage_start.ru_utime,
))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the effect of raising here? Will this just take out metrics? Do we want to log.error but return 0 instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it will take out whatever is running at this point.

My concern with log.error is that it's never going to get noticed, and if it is, we won't be able to figure out why it is happening.


if stime_delta < 0:
raise ValueError("stime went backwards! %f < %f" % (
current.ru_stime, self.usage_start.ru_stime,
))

return utime_delta, stime_delta

def add_database_transaction(self, duration_sec):
if duration_sec < 0:
raise ValueError('DB txn time can only be non-negative')
self._resource_usage.db_txn_count += 1
self._resource_usage.db_txn_duration_sec += duration_sec

Expand All @@ -363,6 +387,8 @@ def add_database_scheduled(self, sched_sec):
sched_sec (float): number of seconds it took us to get a
connection
"""
if sched_sec < 0:
raise ValueError('DB scheduling time can only be non-negative')
self._resource_usage.db_sched_duration_sec += sched_sec

def record_event_fetch(self, event_count):
Expand Down