From f1c7008514dc9f5897750fa1f4b5466842efdf4e Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Sun, 18 Jun 2017 14:02:37 +0300 Subject: [PATCH 1/9] Remove atomic ops used in stats data storage and manipulations. With the change of the concept atomic ops had became unnecessary. Please see the comment here: https://github.com/tempesta-tech/tempesta/issues/712#issuecomment-309234144 --- tempesta_fw/apm.c | 144 ++++++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 83 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index ab38dae50..ec9939b86 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -62,8 +62,8 @@ * * @order - The order of a range. The ranges grow logarithmically. * Motivation: time estimation error becomes negligible as - * the time grows, so higher response times can be estimated - * less accurately; + * the time grows, so higher response times may be estimated + * with less accuracy; * @begin - the start response time value of a range; * @end - the end response time value of a range; * @atomic - atomic control handler to update all control fields @@ -76,7 +76,7 @@ * fall to a specific bucket in a range. * * Keep the members cache line aligned to minimize false sharing: each range - * is placed on a separate cache line, and control hadlers are also on their + * is placed on a separate cache line, and control handlers are also on their * own cache lines. */ #define TFW_STATS_RANGES 4 @@ -96,17 +96,17 @@ typedef union { typedef struct { TfwPcntCtl ctl[TFW_STATS_RANGES]; char __reset_from[0]; - atomic64_t tot_cnt; - atomic64_t tot_val; - atomic_t min_val; - atomic_t max_val; + unsigned long tot_cnt; + unsigned long tot_val; + unsigned long min_val; + unsigned long max_val; unsigned long __pad_ulong; - atomic_t cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; + unsigned long cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; char __reset_till[0]; } TfwPcntRanges __attribute__((aligned(L1_CACHE_BYTES))); -static inline atomic_t * -__rng(TfwPcntCtl *pc, atomic_t *cnt, unsigned int r_time) +static inline unsigned long * +__rng(TfwPcntCtl *pc, unsigned long *cnt, unsigned int r_time) { if (r_time <= pc->begin) return &cnt[0]; @@ -124,14 +124,10 @@ __range_grow_right(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) TFW_DBG3(" -- extend right bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); - /* - * Coalesce all counters to left half of the buckets. - * Some concurrent updates can be lost. - */ + + /* Coalesce all counters to left half of the buckets. */ for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) - atomic_set(&rng->cnt[r][i], - atomic_read(&rng->cnt[r][2 * i]) - + atomic_read(&rng->cnt[r][2 * i + 1])); + rng->cnt[r][i] = rng->cnt[r][2 * i] + rng->cnt[r][2 * i + 1]; } static void @@ -148,20 +144,19 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) " end=%u\n", r, pc->begin, pc->order, pc->end); /* * Write sum of the left half counters to the first bucket and equally - * split counters of the right half among rest of the buckets. - * Some concurrent updates may be lost. + * split counters of the right half among the rest of the buckets. 
*/ for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) - atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2]); + rng->cnt[r][0] += rng->cnt[r][i]; + cnt_full = rng->cnt[r][TFW_STATS_BCKTS / 2]; cnt_half = cnt_full / 2; - atomic_add(cnt_half, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); + rng->cnt[r][0] += cnt_half; + rng->cnt[r][1] = cnt_full - cnt_half; for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) { - cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2 + i]); + cnt_full = rng->cnt[r][TFW_STATS_BCKTS / 2 + i]; cnt_half = cnt_full / 2; - atomic_set(&rng->cnt[r][i * 2], cnt_half); - atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); + rng->cnt[r][i * 2] = cnt_half; + rng->cnt[r][i * 2 + 1] = cnt_full - cnt_half; } } @@ -182,19 +177,16 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) TFW_DBG3(" -- extend last range to begin=%u order=%u end=%u\n", pc.begin, pc.order, pc.end); - /* - * Coalesce all counters to the left half of the buckets. - * Some concurrent updates may be lost. - */ + + /* Coalesce all counters to the left half of the buckets. */ for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) - atomic_set(&rng->cnt[TFW_STATS_RLAST][i], - atomic_read(&rng->cnt[TFW_STATS_RLAST][2 * i]) - + atomic_read(&rng->cnt[TFW_STATS_RLAST][2 * i + 1])); + rng->cnt[TFW_STATS_RLAST][i] = + rng->cnt[TFW_STATS_RLAST][2 * i] + + rng->cnt[TFW_STATS_RLAST][2 * i + 1]; } /** - * See if the range @r contains large outliers. Adjust it if so. - * This is the unlocked version. + * See if range @r contains large outliers. Adjust it if so. * * The leftmost bound is fixed to 1ms. The rightmost bound is only growing * to handle large values. So the adjustment may either increase the gaps @@ -210,15 +202,17 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0; for (i = 0; i < TFW_STATS_BCKTS; ++i) { - if (atomic_read(&rng->cnt[r][i])) { - sum += atomic_read(&rng->cnt[r][i]); + if (rng->cnt[r][i]) { + sum += rng->cnt[r][i]; ++cnt; } - if (max < atomic_read(&rng->cnt[r][i])) { - max = atomic_read(&rng->cnt[r][i]); + if (max < rng->cnt[r][i]) { + max = rng->cnt[r][i]; i_max = i; } } + BUG_ON(!cnt); + /* outlier means (max < avg * 2) */ if (likely(max <= sum * 2 / cnt)) return; @@ -239,10 +233,9 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) * (r - 1)'th range. This is a rough approximation. */ cnt = max / (TFW_STATS_BCKTS / 2 + 1); - atomic_sub(cnt * (TFW_STATS_BCKTS / 2), - &rng->cnt[r][0]); + rng->cnt[r][0] -= cnt * (TFW_STATS_BCKTS / 2); for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) - atomic_set(&rng->cnt[r - 1][i], cnt); + rng->cnt[r - 1][i] = cnt; } /* * Fall through to reduce the range order. 
The first bucket @@ -272,15 +265,10 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) static inline bool tfw_stats_adj_max(TfwPcntRanges *rng, unsigned int r_time) { - int old_val, max_val = atomic_read(&rng->max_val); - - while (r_time > max_val) { - old_val = atomic_cmpxchg(&rng->max_val, max_val, r_time); - if (likely(old_val == max_val)) - return true; - max_val = old_val; + if (r_time > rng->max_val) { + rng->max_val = r_time; + return true; } - return false; } @@ -292,26 +280,16 @@ tfw_stats_adj_max(TfwPcntRanges *rng, unsigned int r_time) static inline bool tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time) { - int old_val, min_val = atomic_read(&rng->min_val); - - while (r_time < min_val) { - old_val = atomic_cmpxchg(&rng->min_val, min_val, r_time); - if (likely(old_val == min_val)) - return true; - min_val = old_val; + if (r_time < rng->min_val) { + rng->min_val = r_time; + return true; } - return false; } /** * Update server response time statistic. * @r_time is in milliseconds (1/HZ second), use jiffies to get it. - * - * The control handlers may be changed during the execution of this - * function. In that case a wrong bucket and/or range may be updated - * in a parallel thread of execution. That's acceptable in our model. - * We only care about correct array indexing. */ static void tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) @@ -322,29 +300,29 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) if (!tfw_stats_adj_min(rng, r_time)) tfw_stats_adj_max(rng, r_time); /* Add to @tot_val for AVG calculation. */ - atomic64_add(r_time, &rng->tot_val); + rng->tot_val += r_time; /* Binary search of an appropriate range. */ if (r_time <= pc2.end) { TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; if (pc1.end < r_time) { - atomic_inc(__rng(&pc2, rng->cnt[2], r_time)); + ++(*__rng(&pc2, rng->cnt[2], r_time)); tfw_stats_adjust(rng, 2); - atomic64_inc(&rng->tot_cnt); + ++rng->tot_cnt; return; } pc0.atomic = rng->ctl[0].atomic; BUG_ON(pc0.begin != 1); /* left bound is never moved */ if (pc0.end < r_time) { - atomic_inc(__rng(&pc1, rng->cnt[1], r_time)); + ++(*__rng(&pc1, rng->cnt[1], r_time)); tfw_stats_adjust(rng, 1); - atomic64_inc(&rng->tot_cnt); + ++rng->tot_cnt; return; } - atomic_inc(__rng(&pc0, rng->cnt[0], r_time)); + ++(*__rng(&pc0, rng->cnt[0], r_time)); tfw_stats_adjust(rng, 0); - atomic64_inc(&rng->tot_cnt); + ++rng->tot_cnt; return; } @@ -353,9 +331,9 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) tfw_stats_extend(rng, r_time); pc3.atomic = rng->ctl[3].atomic; } - atomic_inc(__rng(&pc3, rng->cnt[3], r_time)); + ++(*__rng(&pc3, rng->cnt[3], r_time)); tfw_stats_adjust(rng, 3); - atomic64_inc(&rng->tot_cnt); + ++rng->tot_cnt; } /* @@ -563,7 +541,7 @@ __tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) for (r = i / TFW_STATS_BCKTS; r < TFW_STATS_RANGES; ++r) { for (b = i % TFW_STATS_BCKTS; b < TFW_STATS_BCKTS; ++b, ++i) { - if (!atomic_read(&rng->cnt[r][b])) + if (!rng->cnt[r][b]) continue; rtt = rng->ctl[r].begin + (b << rng->ctl[r].order); __tfw_apm_state_set(st, rtt, i, r, b); @@ -649,7 +627,7 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats if (st[i].v != v_min) continue; pcntrng = &rbent[i].pcntrng; - cnt += atomic_read(&pcntrng->cnt[st[i].r][st[i].b]); + cnt += pcntrng->cnt[st[i].r][st[i].b]; tfw_apm_state_next(pcntrng, &st[i]); } for ( ; p < pstats->psz && pval[p] <= cnt; ++p) @@ -660,12 +638,12 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats 
pstats->val[IDX_MIN] = UINT_MAX; for (i = 0; i < rbuf->rbufsz; i++) { pcntrng = &rbent[i].pcntrng; - if (pstats->val[IDX_MIN] > atomic_read(&pcntrng->min_val)) - pstats->val[IDX_MIN] = atomic_read(&pcntrng->min_val); - if (pstats->val[IDX_MAX] < atomic_read(&pcntrng->max_val)) - pstats->val[IDX_MAX] = atomic_read(&pcntrng->max_val); - cnt += atomic64_read(&pcntrng->tot_cnt); - val += atomic64_read(&pcntrng->tot_val); + if (pstats->val[IDX_MIN] > pcntrng->min_val) + pstats->val[IDX_MIN] = pcntrng->min_val; + if (pstats->val[IDX_MAX] < pcntrng->max_val) + pstats->val[IDX_MAX] = pcntrng->max_val; + cnt += pcntrng->tot_cnt; + val += pcntrng->tot_val; } if (likely(cnt)) pstats->val[IDX_AVG] = val / cnt; @@ -744,8 +722,8 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) tfw_apm_rbent_checkreset(&rbent[centry], jtmistart); for (i = 0; i < rbuf->rbufsz; ++i) - total_cnt += atomic64_read(&rbent[i].pcntrng.tot_cnt); - entry_cnt = atomic64_read(&rbent[centry].pcntrng.tot_cnt); + total_cnt += rbent[i].pcntrng.tot_cnt; + entry_cnt = rbent[centry].pcntrng.tot_cnt; rbctl->entry_cnt = entry_cnt; rbctl->total_cnt = total_cnt; @@ -763,7 +741,7 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) */ /* Nothing to do if there were no stats updates. */ - entry_cnt = atomic64_read(&rbent[centry].pcntrng.tot_cnt); + entry_cnt = rbent[centry].pcntrng.tot_cnt; if (unlikely(rbctl->entry_cnt == entry_cnt)) { if (unlikely(recalc)) { TFW_DBG3("%s: Old time window: recalculate: " From a9310d9f06dd57df2af71d7766a9b57e6662befe Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Sun, 18 Jun 2017 18:40:56 +0300 Subject: [PATCH 2/9] Additional cleanup due to serialization of updates and calculations. --- tempesta_fw/apm.c | 85 ++++++++++------------------------------------- 1 file changed, 17 insertions(+), 68 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index ec9939b86..addc23f51 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -475,7 +475,6 @@ typedef struct { * @flags - The atomic flags (see below). */ #define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ -#define TFW_APM_DATA_F_RECALC (0x0002) /* Need to recalculate. */ #define TFW_APM_TIMER_INTVL (HZ / 20) #define TFW_APM_UBUF_SZ TFW_APM_TIMER_INTVL /* a slot per ms. */ @@ -563,21 +562,8 @@ tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) /* * Calculate the latest percentiles from the current stats data. - * - * Note that the calculation is run under a lock that protects against - * concurrent updates to ranges in the current ring buffer entry. That - * makes the values of @tot_cnt and the hits numbers in the buckets - * mostly consistent. However, there's still a small chance for a race - * condition. @tot_cnt and the buckets' @cnt[][] are updated without - * a lock, asynchronously and at slightly different times. Due to a - * tiny discrepancy between @tot_cnt and the sum of hit counters in - * @cnt[][], the calculation may not be able to reach the target value. - * In that case the calculation exits prematurely, and a recalculation - * is scheduled at the next run of the timer. - * - * Returns the number of percentile values that have been filled. 
*/ -static int +static void tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats) { #define IDX_MIN TFW_PSTATS_IDX_MIN @@ -608,21 +594,7 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats if (st[i].v < v_min) v_min = st[i].v; } - /* - * If the race condition has occured, then the results - * are incomplete and can be used only partially. - */ - if (unlikely(v_min == USHRT_MAX)) { - TFW_DBG3("%s: Calculation stopped prematurely: " - "cnt [%lu] total_cnt [%lu]\n", - __func__, cnt, rbctl->total_cnt); - TFW_DBG3("%s: [%lu] [%lu] [%lu] [%lu] [%lu] [%lu]\n", - __func__, - pval[IDX_ITH], pval[IDX_ITH + 1], - pval[IDX_ITH + 2], pval[IDX_ITH + 3], - pval[IDX_ITH + 4], pval[IDX_ITH + 5]); - break; - } + BUG_ON(v_min == USHRT_MAX); for (i = 0; i < rbuf->rbufsz; i++) { if (st[i].v != v_min) continue; @@ -648,8 +620,6 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats if (likely(cnt)) pstats->val[IDX_AVG] = val / cnt; - return p; - #undef IDX_ITH #undef IDX_AVG #undef IDX_MAX @@ -697,7 +667,7 @@ tfw_apm_rbent_checkreset(TfwApmRBEnt *crbent, unsigned long jtmistamp) * Return false if the percentile values don't need the recalculation. */ static bool -tfw_apm_rbctl_update(TfwApmData *data, int recalc) +tfw_apm_rbctl_update(TfwApmData *data) { int i, centry; unsigned long jtmnow = jiffies; @@ -742,15 +712,8 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) /* Nothing to do if there were no stats updates. */ entry_cnt = rbent[centry].pcntrng.tot_cnt; - if (unlikely(rbctl->entry_cnt == entry_cnt)) { - if (unlikely(recalc)) { - TFW_DBG3("%s: Old time window: recalculate: " - "centry [%d] total_cnt [%lu]\n", - __func__, centry, rbctl->total_cnt); - return true; - } + if (unlikely(rbctl->entry_cnt == entry_cnt)) return false; - } BUG_ON(rbctl->entry_cnt > entry_cnt); /* Update the counts incrementally. */ @@ -765,15 +728,10 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) /* * Calculate the latest percentiles if necessary. - * - * Return 0 if the calculation is successful. - * Return < 0 if there was a system error. - * Return > 0 and < @prcntlsz if the calculation is incomplete. 
*/ -static int +static void tfw_apm_calc(TfwApmData *data) { - int nfilled, recalc; unsigned int rdidx; unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; TfwPrcntlStats pstats = { @@ -786,26 +744,18 @@ tfw_apm_calc(TfwApmData *data) rdidx = atomic_read(&data->stats.rdidx); asent = &data->stats.asent[(rdidx + 1) % 2]; - recalc = test_and_clear_bit(TFW_APM_DATA_F_RECALC, &data->flags); - if (!tfw_apm_rbctl_update(data, recalc)) - return 0; - nfilled = tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, &pstats); - if (!nfilled) - return 0; + if (!tfw_apm_rbctl_update(data)) + return; + tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, &pstats); - if (nfilled < asent->pstats.psz) { - TFW_DBG3("%s: Percentile calculation incomplete.\n", __func__); - set_bit(TFW_APM_DATA_F_RECALC, &data->flags); - } else { - TFW_DBG3("%s: Percentile values may have changed.\n", __func__); - write_lock(&asent->rwlock); - memcpy(asent->pstats.val, pstats.val, - asent->pstats.psz * sizeof(asent->pstats.val[0])); - atomic_inc(&data->stats.rdidx); - write_unlock(&asent->rwlock); - } + TFW_DBG3("%s: Percentile values may have changed.\n", __func__); + write_lock(&asent->rwlock); + memcpy(asent->pstats.val, pstats.val, + asent->pstats.psz * sizeof(asent->pstats.val[0])); + atomic_inc(&data->stats.rdidx); + write_unlock(&asent->rwlock); - return nfilled % asent->pstats.psz; + return; } /* @@ -904,9 +854,8 @@ tfw_apm_prcntl_tmfn(unsigned long fndata) ++updone; } } - if (updone && unlikely(tfw_apm_calc(data))) { - TFW_DBG2("%s: Incomplete calculation\n", __func__); - } + if (updone) + tfw_apm_calc(data); smp_mb(); if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) From 870f051213a08084d747b497fb2c97f69d9026b4 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Sun, 18 Jun 2017 22:08:32 +0300 Subject: [PATCH 3/9] Small optimization in tfw_apm_adjust(). Make sure the function is not called when r equals zero (the first range the left boundary of which is fixed at 1ms). --- tempesta_fw/apm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index addc23f51..f70955b85 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -201,6 +201,9 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) TfwPcntCtl pc; unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0; + if (unlikely(r == 0)) + return; + for (i = 0; i < TFW_STATS_BCKTS; ++i) { if (rng->cnt[r][i]) { sum += rng->cnt[r][i]; @@ -217,7 +220,7 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) if (likely(max <= sum * 2 / cnt)) return; - if (r && i_max == 0) { + if (i_max == 0) { /* * Too many hits in the gap between r'th and (r - 1)'th ranges. * Move the right bound of the (r - 1)'th range to the right. @@ -249,12 +252,9 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) * If servers are too fast (all responses within 1ms), then there's * nothing to do here. */ - if (!r) - return; pc.atomic = rng->ctl[r].atomic; - if (likely(pc.order)) { + if (likely(pc.order)) __range_shrink_left(rng, &pc, r); - } } /* @@ -321,7 +321,6 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) return; } ++(*__rng(&pc0, rng->cnt[0], r_time)); - tfw_stats_adjust(rng, 0); ++rng->tot_cnt; return; } From 689a4eeadc413b9b127b891e95a6130102e2adde Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Sun, 18 Jun 2017 22:13:53 +0300 Subject: [PATCH 4/9] Small fix in tfw_stats_extend(). After the counters are coalesced to the left half of the range, zero out the right half of the range when the range is extended. 
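As a quick illustration (not part of the patch itself), the coalesce-and-zero
step boils down to the sketch below; the helper name and the fixed bucket
count of 16 are mine and purely illustrative. Without the final memset the
stale counters in the right half would be attributed to the new, wider
intervals those buckets now represent.

#include <string.h>

#define BCKTS 16

/* Fold pairs of old buckets into the left half, then clear the right
 * half so its stale values are not counted for the widened intervals. */
static void
coalesce_and_zero(unsigned long cnt[BCKTS])
{
	int i;

	for (i = 0; i < BCKTS / 2; ++i)
		cnt[i] = cnt[2 * i] + cnt[2 * i + 1];
	memset(&cnt[BCKTS / 2], 0, sizeof(cnt[0]) * (BCKTS / 2));
}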
--- tempesta_fw/apm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index f70955b85..e1b093bdb 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -183,6 +183,8 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) rng->cnt[TFW_STATS_RLAST][i] = rng->cnt[TFW_STATS_RLAST][2 * i] + rng->cnt[TFW_STATS_RLAST][2 * i + 1]; + for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) + rng->cnt[TFW_STATS_RLAST][i] = 0; } /** From e2e1d628d3fbd87b1ca719da9f5e83c6863f9d5c Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 19 Jun 2017 14:02:26 +0300 Subject: [PATCH 5/9] Fix the wraparound bug in tfw_apm_extend(). The left boundary of a range expands using a special algorithm. When RTT value is big enough (but still within the range of unsigned short), the next value of the left boundary may exceed both the value of RTT and the maximum value of unsigned short. In that case a wraparound occurs which leads to an endless loop. --- tempesta_fw/apm.c | 54 +++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index e1b093bdb..30a120c45 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -164,15 +164,33 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) * Extend the last range so that larger response times can be handled. */ static void -tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) +tfw_stats_extend(TfwPcntRanges *rng, unsigned int *r_time) { int i; TfwPcntCtl pc = { .atomic = rng->ctl[TFW_STATS_RLAST].atomic }; + unsigned int end = pc.end, prend; do { ++pc.order; - pc.end = pc.begin + ((TFW_STATS_BCKTS - 1) << pc.order); - } while (pc.end < r_time); + prend = end; + end = pc.begin + ((TFW_STATS_BCKTS - 1) << pc.order); + } while (end < *r_time); + + /* + * If the value of "end" exceeds the maximum value of unsigned + * short, then the previous value of "end" is definitely within + * the limits. Roll back to the previous values, and adjust + * "r_time" to fit the range. If unable to expand at all, then + * just return the adjusted "r_time". + */ + if (end >= (1U << (FIELD_SIZEOF(TfwPcntCtl, end) * 8))) { + --pc.order; + *r_time = end = prend; + if (end == pc.end) + return; + } + + pc.end = (unsigned short)end; rng->ctl[TFW_STATS_RLAST].atomic = pc.atomic; TFW_DBG3(" -- extend last range to begin=%u order=%u end=%u\n", @@ -298,20 +316,13 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) { TfwPcntCtl pc3, pc2 = { .atomic = rng->ctl[2].atomic }; - /* Adjust min/max values. */ - if (!tfw_stats_adj_min(rng, r_time)) - tfw_stats_adj_max(rng, r_time); - /* Add to @tot_val for AVG calculation. */ - rng->tot_val += r_time; - /* Binary search of an appropriate range. 
*/ if (r_time <= pc2.end) { TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; if (pc1.end < r_time) { ++(*__rng(&pc2, rng->cnt[2], r_time)); tfw_stats_adjust(rng, 2); - ++rng->tot_cnt; - return; + goto totals; } pc0.atomic = rng->ctl[0].atomic; @@ -319,22 +330,29 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) if (pc0.end < r_time) { ++(*__rng(&pc1, rng->cnt[1], r_time)); tfw_stats_adjust(rng, 1); - ++rng->tot_cnt; - return; + goto totals; } ++(*__rng(&pc0, rng->cnt[0], r_time)); - ++rng->tot_cnt; - return; + goto totals; } pc3.atomic = rng->ctl[3].atomic; if (unlikely(r_time > pc3.end)) { - tfw_stats_extend(rng, r_time); + tfw_stats_extend(rng, &r_time); pc3.atomic = rng->ctl[3].atomic; } ++(*__rng(&pc3, rng->cnt[3], r_time)); tfw_stats_adjust(rng, 3); + +totals: + /* Adjust min/max values. */ + if (!tfw_stats_adj_min(rng, r_time)) + tfw_stats_adj_max(rng, r_time); + /* Add to @tot_val for AVG calculation. */ + rng->tot_val += r_time; ++rng->tot_cnt; + + return; } /* @@ -864,7 +882,7 @@ tfw_apm_prcntl_tmfn(unsigned long fndata) } static void -__tfw_apm_update(TfwApmData *data, unsigned long jtstamp, unsigned long rtt) +__tfw_apm_update(TfwApmData *data, unsigned long jtstamp, unsigned int rtt) { TfwApmUBuf *ubuf = this_cpu_ptr(data->ubuf); unsigned long idxval = atomic64_add_return(0, &ubuf->counter); @@ -888,7 +906,7 @@ tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtt) * the maximum value possible for TfwPcntCtl{}->end. Currently * the value is USHRT_MAX which is about 65 secs in milliseconds. */ - if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) + if (likely(rtt < (1U << (FIELD_SIZEOF(TfwPcntCtl, end) * 8)))) __tfw_apm_update(apmref, jtstamp, rtt); } From 1d4b9161f7ae5db28094cdfa1711eb33e1ca15b5 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Mon, 19 Jun 2017 14:48:27 +0300 Subject: [PATCH 6/9] Always calculate new percentiles in tfw_apm_prcntl_tmfn(). The percentiles were calculated only if there were updates of stats data. That lead to situations where percentiles were "stuck" at some previous values even though the time frame has moved forward. --- tempesta_fw/apm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 30a120c45..4592e2ea2 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -845,7 +845,7 @@ tfw_apm_pstats_verify(TfwPrcntlStats *pstats) static void tfw_apm_prcntl_tmfn(unsigned long fndata) { - int i, icpu, updone = 0; + int i, icpu; TfwApmData *data = (TfwApmData *)fndata; TfwApmRBuf *rbuf = &data->rbuf; TfwApmRBEnt *rbent = rbuf->rbent; @@ -870,11 +870,9 @@ tfw_apm_prcntl_tmfn(unsigned long fndata) WRITE_ONCE(ubent[i].data, ULONG_MAX); tfw_stats_update(&rbent[rtt_data.centry].pcntrng, rtt_data.rtt); - ++updone; } } - if (updone) - tfw_apm_calc(data); + tfw_apm_calc(data); smp_mb(); if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) From 1aacac7bfce0d71bf6d9c13000d8393c5d28e5a5 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Wed, 21 Jun 2017 14:13:28 +0300 Subject: [PATCH 7/9] Expand TfwPcntCtl{} members in APM to unsigned int type. TfwPcntCtl{} has supported RTT values up to about 65 seconds. It's possible that higher values may be seen in the wild. This patch makes higher values acceptable. 
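For reference, the layout change amounts to the comparison below; the Old/New
suffixes are mine, added only so the two variants can sit side by side (the
real type keeps the name TfwPcntCtl), and both layouts are taken from the
diff that follows.

/* Packed layout used so far: ->end is an unsigned short, which caps
 * recordable response times at USHRT_MAX ms, roughly 65 seconds. */
typedef union {
	struct {
		unsigned int order;
		unsigned short begin;
		unsigned short end;
	} __attribute__((packed));
	unsigned long atomic;	/* one-word snapshot of all three fields */
} TfwPcntCtlOld;

/* Layout after this patch: with stats updates serialized there is no
 * need for the one-word snapshot, and an unsigned int ->end raises the
 * ceiling from ~65 seconds to UINT_MAX milliseconds. */
typedef struct {
	unsigned int order;
	unsigned int begin;
	unsigned int end;
} TfwPcntCtlNew;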
--- tempesta_fw/apm.c | 148 +++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 82 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 4592e2ea2..20aef1ea6 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -82,15 +82,14 @@ #define TFW_STATS_RANGES 4 #define TFW_STATS_RLAST (TFW_STATS_RANGES - 1) #define TFW_STATS_BCKTS 16 -#define TFW_STATS_TOTAL_BCKTS (TFW_STATS_RANGES * TFW_STATS_BCKTS) -typedef union { - struct { - unsigned int order; - unsigned short begin; - unsigned short end; - } __attribute__((packed)); - unsigned long atomic; +#define TFW_STATS_RSPAN(order) ((TFW_STATS_BCKTS - 1) << (order)) +#define TFW_STATS_RSPANUL(order) ((TFW_STATS_BCKTS - 1UL) << (order)) + +typedef struct { + unsigned int order; + unsigned int begin; + unsigned int end; } TfwPcntCtl; typedef struct { @@ -100,7 +99,6 @@ typedef struct { unsigned long tot_val; unsigned long min_val; unsigned long max_val; - unsigned long __pad_ulong; unsigned long cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; char __reset_till[0]; } TfwPcntRanges __attribute__((aligned(L1_CACHE_BYTES))); @@ -119,8 +117,7 @@ __range_grow_right(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) int i; ++pc->order; - pc->end = pc->begin + ((TFW_STATS_BCKTS - 1) << pc->order); - rng->ctl[r].atomic = pc->atomic; + pc->end = pc->begin + TFW_STATS_RSPAN(pc->order); TFW_DBG3(" -- extend right bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); @@ -137,8 +134,7 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) unsigned long cnt_full, cnt_half; --pc->order; - pc->begin = pc->end - ((TFW_STATS_BCKTS - 1) << pc->order); - rng->ctl[r].atomic = pc->atomic; + pc->begin = pc->end - TFW_STATS_RSPAN(pc->order); TFW_DBG3(" -- shrink left bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); @@ -167,13 +163,13 @@ static void tfw_stats_extend(TfwPcntRanges *rng, unsigned int *r_time) { int i; - TfwPcntCtl pc = { .atomic = rng->ctl[TFW_STATS_RLAST].atomic }; - unsigned int end = pc.end, prend; + TfwPcntCtl *pc = &rng->ctl[TFW_STATS_RLAST]; + unsigned long end = pc->end, prend; do { - ++pc.order; + ++pc->order; prend = end; - end = pc.begin + ((TFW_STATS_BCKTS - 1) << pc.order); + end = pc->begin + TFW_STATS_RSPANUL(pc->order); } while (end < *r_time); /* @@ -183,18 +179,19 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int *r_time) * "r_time" to fit the range. If unable to expand at all, then * just return the adjusted "r_time". */ - if (end >= (1U << (FIELD_SIZEOF(TfwPcntCtl, end) * 8))) { - --pc.order; + if (unlikely(end >= (1UL << (FIELD_SIZEOF(TfwPcntCtl, end) * 8)))) { + TFW_DBG3("%s: Unable to extend beyond r_time=[%u] end=[%lu]." + __func__, *r_time, end); + --pc->order; *r_time = end = prend; - if (end == pc.end) + if (end == pc->end) return; } - pc.end = (unsigned short)end; - rng->ctl[TFW_STATS_RLAST].atomic = pc.atomic; + pc->end = (typeof(pc->end))end; TFW_DBG3(" -- extend last range to begin=%u order=%u end=%u\n", - pc.begin, pc.order, pc.end); + pc->begin, pc->order, pc->end); /* Coalesce all counters to the left half of the buckets. 
*/ for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) @@ -218,8 +215,9 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int *r_time) static void tfw_stats_adjust(TfwPcntRanges *rng, int r) { - TfwPcntCtl pc; - unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0; + int i; + TfwPcntCtl *pc, *prepc; + unsigned long prend, cnt = 0, sum = 0, max = 0, i_max = 0; if (unlikely(r == 0)) return; @@ -240,31 +238,26 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) if (likely(max <= sum * 2 / cnt)) return; - if (i_max == 0) { - /* - * Too many hits in the gap between r'th and (r - 1)'th ranges. - * Move the right bound of the (r - 1)'th range to the right. - */ - TfwPcntCtl pc_curr = { .atomic = rng->ctl[r].atomic }; - pc.atomic = rng->ctl[r - 1].atomic; - if (pc.begin + ((TFW_STATS_BCKTS - 1) << (pc.order + 1)) - < pc_curr.begin) - { - __range_grow_right(rng, &pc, r - 1); - /* - * Evenly distibute hits among the right half of the - * (r - 1)'th range. This is a rough approximation. - */ - cnt = max / (TFW_STATS_BCKTS / 2 + 1); - rng->cnt[r][0] -= cnt * (TFW_STATS_BCKTS / 2); - for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) - rng->cnt[r - 1][i] = cnt; - } - /* - * Fall through to reduce the range order. The first bucket - * gets a higher count. Since the left bound has been moved, - * the right bound of (r - 1)'th range will be moved next time. - */ + /* + * If too many hits fall in the gap between r'th and (r - 1)'th + * ranges, and (r - 1)'th range can grow, then grow that range + * and spread these hits evenly in the right half of (r - 1)'th + * range as a rough approximation. Afterwards, move on to reduce + * the range order. The first bucket gets a higher count. Since + * the left bound has been moved, the right bound of (r - 1)'th + * range will be moved next time. + */ + pc = &rng->ctl[r]; + prepc = &rng->ctl[r - 1]; + prend = prepc->begin + TFW_STATS_RSPANUL(prepc->order + 1); + + if ((i_max == 0) && (prend < pc->begin)) { + __range_grow_right(rng, prepc, r - 1); + + cnt = max / (TFW_STATS_BCKTS / 2 + 1); + rng->cnt[r][0] -= cnt * (TFW_STATS_BCKTS / 2); + for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) + rng->cnt[r - 1][i] = cnt; } /* @@ -272,9 +265,8 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) * If servers are too fast (all responses within 1ms), then there's * nothing to do here. */ - pc.atomic = rng->ctl[r].atomic; - if (likely(pc.order)) - __range_shrink_left(rng, &pc, r); + if (likely(pc->order)) + __range_shrink_left(rng, pc, r); } /* @@ -314,34 +306,34 @@ tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time) static void tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) { - TfwPcntCtl pc3, pc2 = { .atomic = rng->ctl[2].atomic }; + TfwPcntCtl *pc3, *pc2 = &rng->ctl[2]; /* Binary search of an appropriate range. 
*/ - if (r_time <= pc2.end) { - TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; - if (pc1.end < r_time) { - ++(*__rng(&pc2, rng->cnt[2], r_time)); + if (r_time <= pc2->end) { + TfwPcntCtl *pc0, *pc1 = &rng->ctl[1]; + + if (r_time > pc1->end) { + ++(*__rng(pc2, rng->cnt[2], r_time)); tfw_stats_adjust(rng, 2); goto totals; } - pc0.atomic = rng->ctl[0].atomic; - BUG_ON(pc0.begin != 1); /* left bound is never moved */ - if (pc0.end < r_time) { - ++(*__rng(&pc1, rng->cnt[1], r_time)); + pc0 = &rng->ctl[0]; + BUG_ON(pc0->begin != 1); /* left bound is never moved */ + if (r_time > pc0->end) { + ++(*__rng(pc1, rng->cnt[1], r_time)); tfw_stats_adjust(rng, 1); goto totals; } - ++(*__rng(&pc0, rng->cnt[0], r_time)); + + ++(*__rng(pc0, rng->cnt[0], r_time)); goto totals; } - pc3.atomic = rng->ctl[3].atomic; - if (unlikely(r_time > pc3.end)) { + pc3 = &rng->ctl[3]; + if (unlikely(r_time > pc3->end)) tfw_stats_extend(rng, &r_time); - pc3.atomic = rng->ctl[3].atomic; - } - ++(*__rng(&pc3, rng->cnt[3], r_time)); + ++(*__rng(pc3, rng->cnt[3], r_time)); tfw_stats_adjust(rng, 3); totals: @@ -512,10 +504,10 @@ typedef struct { * including cross atlantic. */ static const TfwPcntCtl tfw_rngctl_init[TFW_STATS_RANGES] = { - {{0, 1, 16}}, - {{1, 17, 47}}, - {{2, 48, 108}}, - {{4, 109, 349}} + {0, 1, 16}, + {1, 17, 47}, + {2, 48, 108}, + {4, 109, 349} }; static int tfw_apm_jtmwindow; /* Time window in jiffies. */ @@ -573,7 +565,7 @@ __tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) static inline void tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) { - BUG_ON(st->i >= TFW_STATS_TOTAL_BCKTS); + BUG_ON(st->i >= TFW_STATS_RANGES * TFW_STATS_BCKTS); ++st->i; __tfw_apm_state_next(rng, st); @@ -896,16 +888,8 @@ __tfw_apm_update(TfwApmData *data, unsigned long jtstamp, unsigned int rtt) void tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtt) { - unsigned int rtt = jiffies_to_msecs(jrtt); - BUG_ON(!apmref); - /* - * APM stats can't handle response times that are greater than - * the maximum value possible for TfwPcntCtl{}->end. Currently - * the value is USHRT_MAX which is about 65 secs in milliseconds. - */ - if (likely(rtt < (1U << (FIELD_SIZEOF(TfwPcntCtl, end) * 8)))) - __tfw_apm_update(apmref, jtstamp, rtt); + __tfw_apm_update(apmref, jtstamp, jiffies_to_msecs(jrtt)); } static void From a88984a505d08163259d279be8e456a3348b5654 Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Wed, 28 Jun 2017 19:43:32 +0300 Subject: [PATCH 8/9] Fix a bug when the last range is extended by multiple 'order'. As the value of 'order' increases, the range may grow by 2, 4, 8, etc. times. Yet just a half of the range was coalesced. Also, this patch removes the code that was intended to deal with an overflow of TfwPcntCtl{}->end. That's unnecessary now that it is of type unsigned int. 
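To make the fix easier to follow, here is a minimal standalone sketch of the
multi-order coalescing, assuming 16 buckets; the helper name is mine, and
where the patch special-cases 2 and 4 old buckets per new bucket, the sketch
keeps only the generic summing loop, which covers those cases as well.

#include <string.h>

#define BCKTS_ORDER	4
#define BCKTS		(1 << BCKTS_ORDER)

/* When the range order grows by @shift, each remaining bucket must
 * absorb units = 1 << shift old buckets (previously only pairs were
 * merged); the buckets beyond @parts describe new, empty intervals. */
static void
coalesce_by_shift(unsigned long cnt[BCKTS], unsigned int shift)
{
	unsigned int i, b, units, parts;
	unsigned long sum;

	if (shift > BCKTS_ORDER)
		shift = BCKTS_ORDER;	/* fold everything into cnt[0] at most */
	units = 1 << shift;
	parts = BCKTS >> shift;

	for (i = 0; i < parts; ++i) {
		sum = 0;
		for (b = i * units; b < (i + 1) * units; ++b)
			sum += cnt[b];
		cnt[i] = sum;
	}
	memset(&cnt[parts], 0, sizeof(cnt[0]) * (BCKTS - parts));
}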
--- tempesta_fw/apm.c | 91 ++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 36 deletions(-) diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 20aef1ea6..570098865 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -81,10 +81,11 @@ */ #define TFW_STATS_RANGES 4 #define TFW_STATS_RLAST (TFW_STATS_RANGES - 1) -#define TFW_STATS_BCKTS 16 +#define TFW_STATS_BCKTS_ORDER 4 +#define TFW_STATS_BCKTS (1 << TFW_STATS_BCKTS_ORDER) #define TFW_STATS_RSPAN(order) ((TFW_STATS_BCKTS - 1) << (order)) -#define TFW_STATS_RSPANUL(order) ((TFW_STATS_BCKTS - 1UL) << (order)) +#define TFW_STATS_RSPAN_UL(order) ((TFW_STATS_BCKTS - 1UL) << (order)) typedef struct { unsigned int order; @@ -122,7 +123,7 @@ __range_grow_right(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) TFW_DBG3(" -- extend right bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); - /* Coalesce all counters to left half of the buckets. */ + /* Coalesce counters to buckets on the left half of the range. */ for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) rng->cnt[r][i] = rng->cnt[r][2 * i] + rng->cnt[r][2 * i + 1]; } @@ -160,46 +161,62 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) * Extend the last range so that larger response times can be handled. */ static void -tfw_stats_extend(TfwPcntRanges *rng, unsigned int *r_time) +tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) { - int i; + int i, b; TfwPcntCtl *pc = &rng->ctl[TFW_STATS_RLAST]; - unsigned long end = pc->end, prend; + unsigned int sum, parts, units, shift, order = pc->order; + + BUILD_BUG_ON_NOT_POWER_OF_2(TFW_STATS_BCKTS); do { - ++pc->order; - prend = end; - end = pc->begin + TFW_STATS_RSPANUL(pc->order); - } while (end < *r_time); + ++order; + pc->end = pc->begin + TFW_STATS_RSPAN_UL(order); + } while (pc->end < r_time); /* - * If the value of "end" exceeds the maximum value of unsigned - * short, then the previous value of "end" is definitely within - * the limits. Roll back to the previous values, and adjust - * "r_time" to fit the range. If unable to expand at all, then - * just return the adjusted "r_time". + * Consirering that TfwPcntCtl{}->end is of type unsigned int, + * it's totally unimaginable that this situation may ever happen. */ - if (unlikely(end >= (1UL << (FIELD_SIZEOF(TfwPcntCtl, end) * 8)))) { - TFW_DBG3("%s: Unable to extend beyond r_time=[%u] end=[%lu]." - __func__, *r_time, end); - --pc->order; - *r_time = end = prend; - if (end == pc->end) - return; - } + BUG_ON(pc->end >= (1UL << (FIELD_SIZEOF(TfwPcntCtl, end) * 8))); + + shift = min_t(unsigned int, order - pc->order, TFW_STATS_BCKTS_ORDER); + units = 1 << shift; + parts = TFW_STATS_BCKTS >> shift; - pc->end = (typeof(pc->end))end; + pc->order = order; TFW_DBG3(" -- extend last range to begin=%u order=%u end=%u\n", pc->begin, pc->order, pc->end); - /* Coalesce all counters to the left half of the buckets. */ - for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) - rng->cnt[TFW_STATS_RLAST][i] = - rng->cnt[TFW_STATS_RLAST][2 * i] - + rng->cnt[TFW_STATS_RLAST][2 * i + 1]; - for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) - rng->cnt[TFW_STATS_RLAST][i] = 0; + /* + * Coalesce counters to buckets on the left side of the range. + * Clear the buckets that represent the new extended range. 
+ */ + for (i = 0; i < parts; ++i) { + switch (units) { + case 2: + rng->cnt[TFW_STATS_RLAST][i] = + rng->cnt[TFW_STATS_RLAST][2 * i] + + rng->cnt[TFW_STATS_RLAST][2 * i + 1]; + break; + case 4: + rng->cnt[TFW_STATS_RLAST][i] = + rng->cnt[TFW_STATS_RLAST][4 * i] + + rng->cnt[TFW_STATS_RLAST][4 * i + 1] + + rng->cnt[TFW_STATS_RLAST][4 * i + 2] + + rng->cnt[TFW_STATS_RLAST][4 * i + 3]; + break; + default: + sum = 0; + for (b = i * units; b < (i + 1) * units; ++b) + sum += rng->cnt[TFW_STATS_RLAST][b]; + rng->cnt[TFW_STATS_RLAST][i] = sum; + break; + } + } + memset(&rng->cnt[TFW_STATS_RLAST][parts], 0, + sizeof(rng->cnt[0][0]) * (TFW_STATS_BCKTS - parts)); } /** @@ -219,8 +236,7 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) TfwPcntCtl *pc, *prepc; unsigned long prend, cnt = 0, sum = 0, max = 0, i_max = 0; - if (unlikely(r == 0)) - return; + BUG_ON(r == 0); for (i = 0; i < TFW_STATS_BCKTS; ++i) { if (rng->cnt[r][i]) { @@ -238,6 +254,9 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) if (likely(max <= sum * 2 / cnt)) return; + TFW_DBG3(" -- range %d has an outlier %lu (avg=%lu total=%lu) at" + " bucket %lu\n", r, max, sum / cnt, sum, i_max); + /* * If too many hits fall in the gap between r'th and (r - 1)'th * ranges, and (r - 1)'th range can grow, then grow that range @@ -249,7 +268,7 @@ tfw_stats_adjust(TfwPcntRanges *rng, int r) */ pc = &rng->ctl[r]; prepc = &rng->ctl[r - 1]; - prend = prepc->begin + TFW_STATS_RSPANUL(prepc->order + 1); + prend = prepc->begin + TFW_STATS_RSPAN_UL(prepc->order + 1); if ((i_max == 0) && (prend < pc->begin)) { __range_grow_right(rng, prepc, r - 1); @@ -332,7 +351,7 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) pc3 = &rng->ctl[3]; if (unlikely(r_time > pc3->end)) - tfw_stats_extend(rng, &r_time); + tfw_stats_extend(rng, r_time); ++(*__rng(pc3, rng->cnt[3], r_time)); tfw_stats_adjust(rng, 3); @@ -547,7 +566,7 @@ static inline void __tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) { int i = st->i, r, b; - unsigned short rtt; + unsigned int rtt; for (r = i / TFW_STATS_BCKTS; r < TFW_STATS_RANGES; ++r) { for (b = i % TFW_STATS_BCKTS; b < TFW_STATS_BCKTS; ++b, ++i) { From 78253e80a8638cc3949bb672b4d02d8c422f6cea Mon Sep 17 00:00:00 2001 From: Aleksey Baulin Date: Thu, 29 Jun 2017 00:43:29 +0300 Subject: [PATCH 9/9] Align the reference implementation with the current code in APM. Remove atomic ops. Fix all bugs that have been found so far. --- tempesta_fw/t/unit/user_space/Makefile | 2 +- tempesta_fw/t/unit/user_space/percentiles.c | 323 ++++++++++---------- 2 files changed, 165 insertions(+), 160 deletions(-) diff --git a/tempesta_fw/t/unit/user_space/Makefile b/tempesta_fw/t/unit/user_space/Makefile index bdec6b733..2b39913eb 100644 --- a/tempesta_fw/t/unit/user_space/Makefile +++ b/tempesta_fw/t/unit/user_space/Makefile @@ -28,7 +28,7 @@ endif CACHELINE := $(shell getconf LEVEL1_DCACHE_LINESIZE) CFLAGS = -O0 -ggdb -Wall -Werror \ - -pthread -DL1_CACHE_BYTES=$(CACHELINE) \ + -DL1_CACHE_BYTES=$(CACHELINE) \ -I../../../../ktest CXXFLAGS = -std=c++11 ${CFLAGS} TARGETS = alb percentiles slr diff --git a/tempesta_fw/t/unit/user_space/percentiles.c b/tempesta_fw/t/unit/user_space/percentiles.c index 021695450..49c8eb90a 100644 --- a/tempesta_fw/t/unit/user_space/percentiles.c +++ b/tempesta_fw/t/unit/user_space/percentiles.c @@ -41,12 +41,13 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include +#include #include -#include #include -#include +#include +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) #define SET(s) {ARRAY_SIZE(s), s} /* @@ -120,28 +121,29 @@ basic_percentilie(const unsigned int *set, size_t len, Percentilie *pcnts, * placed at separate cache line and control hadlers are also at their own * cache line. */ -#define TFW_STAT_RANGES 4 -#define TFW_STAT_RLAST (TFW_STAT_RANGES - 1) -#define TFW_STAT_BCKTS 16 - -typedef union { - struct { - unsigned int order; - unsigned short begin; - unsigned short end; - } __attribute__((packed)); - unsigned long atomic; +#define TFW_STATS_RANGES 4 +#define TFW_STATS_RLAST (TFW_STATS_RANGES - 1) +#define TFW_STATS_BCKTS_ORDER 4 +#define TFW_STATS_BCKTS (1 << TFW_STATS_BCKTS_ORDER) + +#define TFW_STATS_RSPAN(order) ((TFW_STATS_BCKTS - 1) << (order)) +#define TFW_STATS_RSPAN_UL(order) ((TFW_STATS_BCKTS - 1UL) << (order)) + +typedef struct { + unsigned int order; + unsigned int begin; + unsigned int end; } TfwPcntCtl; typedef struct { - TfwPcntCtl ctl[TFW_STAT_RANGES]; - atomic64_t tot_cnt; - unsigned long __padding[TFW_STAT_RLAST]; - atomic_t cnt[TFW_STAT_RANGES][TFW_STAT_BCKTS]; + TfwPcntCtl ctl[TFW_STATS_RANGES]; + unsigned long tot_cnt; + unsigned long __padding[TFW_STATS_RLAST]; + unsigned long cnt[TFW_STATS_RANGES][TFW_STATS_BCKTS]; } TfwPcntRanges __attribute__((aligned(L1_CACHE_BYTES))); -static inline atomic_t * -__rng(TfwPcntCtl *pc, atomic_t *cnt, unsigned int r_time) +static inline unsigned long * +__rng(TfwPcntCtl *pc, unsigned long *cnt, unsigned int r_time) { if (r_time <= pc->begin) return &cnt[0]; @@ -154,20 +156,14 @@ __range_grow_right(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) int i; ++pc->order; - pc->end = pc->begin + ((TFW_STAT_BCKTS - 1) << pc->order); - rng->ctl[r].atomic = pc->atomic; + pc->end = pc->begin + TFW_STATS_RSPAN(pc->order);; printf(" -- extend right bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); - /* - * Coalesce all counters to left half of the buckets. - * Some concurrent updates can be lost. - */ - for (i = 0; i < TFW_STAT_BCKTS / 2; ++i) - atomic_set(&rng->cnt[r][i], - atomic_read(&rng->cnt[r][2 * i]) - + atomic_read(&rng->cnt[r][2 * i + 1])); + /* Coalesce counters to buckets on the left half of the range. */ + for (i = 0; i < TFW_STATS_BCKTS / 2; ++i) + rng->cnt[r][i] = rng->cnt[r][2 * i] + rng->cnt[r][2 * i + 1]; } static void @@ -177,184 +173,193 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) unsigned long cnt_full, cnt_half; --pc->order; - pc->begin = pc->end - ((TFW_STAT_BCKTS - 1) << pc->order); - rng->ctl[r].atomic = pc->atomic; + pc->begin = pc->end - TFW_STATS_RSPAN(pc->order); printf(" -- shrink left bound of range %d to begin=%u order=%u" " end=%u\n", r, pc->begin, pc->order, pc->end); /* * Write sum of the left half counters to the first bucket and equally - * split counters of the right half among rest of the buckets. - * Some concurrent updates can be lost. + * split counters of the right half among the rest of the buckets. 
*/ - for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) - atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2]); + for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) + rng->cnt[r][0] += rng->cnt[r][i]; + cnt_full = rng->cnt[r][TFW_STATS_BCKTS / 2]; cnt_half = cnt_full / 2; - atomic_add(cnt_half, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); - for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) { - cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2 + i]); + rng->cnt[r][0] += cnt_half; + rng->cnt[r][1] = cnt_full - cnt_half; + for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) { + cnt_full = rng->cnt[r][TFW_STATS_BCKTS / 2 + i]; cnt_half = cnt_full / 2; - atomic_set(&rng->cnt[r][i * 2], cnt_half); - atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); + rng->cnt[r][i * 2] = cnt_half; + rng->cnt[r][i * 2 + 1] = cnt_full - cnt_half; } } /** - * Extend last range so that we can handle large response times. + * Extend the last range so that larger response times can be handled. */ static void -tfw_stat_extend(TfwPcntRanges *rng, unsigned int r_time) +tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) { - int i; - TfwPcntCtl pc = { .atomic = rng->ctl[TFW_STAT_RLAST].atomic }; + int i, b; + TfwPcntCtl *pc = &rng->ctl[TFW_STATS_RLAST]; + unsigned int sum, parts, units, shift, order = pc->order; do { - ++pc.order; - pc.end = pc.begin + ((TFW_STAT_BCKTS - 1) << pc.order); - } while (pc.end < r_time); - rng->ctl[TFW_STAT_RLAST].atomic = pc.atomic; + ++order; + pc->end = pc->begin + TFW_STATS_RSPAN_UL(order); + } while (pc->end < r_time); + + shift = min(order - pc->order, TFW_STATS_BCKTS_ORDER); + units = 1 << shift; + parts = TFW_STATS_BCKTS >> shift; + + pc->order = order; printf(" -- extend last range to begin=%u order=%u end=%u\n", - pc.begin, pc.order, pc.end); + pc->begin, pc->order, pc->end); /* - * Coalesce all counters to left half of the buckets. - * Some concurrent updates can be lost. + * Coalesce counters to buckets on the left side of the range. + * Clear the buckets that represent the new extended range. */ - for (i = 0; i < TFW_STAT_BCKTS / 2; ++i) - atomic_set(&rng->cnt[TFW_STAT_RLAST][i], - atomic_read(&rng->cnt[TFW_STAT_RLAST][2 * i]) - + atomic_read(&rng->cnt[TFW_STAT_RLAST][2 * i + 1])); + for (i = 0; i < parts; ++i) { + switch (units) { + case 2: + rng->cnt[TFW_STATS_RLAST][i] = + rng->cnt[TFW_STATS_RLAST][2 * i] + + rng->cnt[TFW_STATS_RLAST][2 * i + 1]; + break; + case 4: + rng->cnt[TFW_STATS_RLAST][i] = + rng->cnt[TFW_STATS_RLAST][4 * i] + + rng->cnt[TFW_STATS_RLAST][4 * i + 1] + + rng->cnt[TFW_STATS_RLAST][4 * i + 2] + + rng->cnt[TFW_STATS_RLAST][4 * i + 3]; + break; + default: + sum = 0; + for (b = i * units; b < (i + 1) * units; ++b) + sum += rng->cnt[TFW_STATS_RLAST][b]; + rng->cnt[TFW_STATS_RLAST][i] = sum; + break; + } + } + memset(&rng->cnt[TFW_STATS_RLAST][parts], 0, + sizeof(rng->cnt[0][0]) * (TFW_STATS_BCKTS - parts)); } /** - * Check range @r whether it contains large outliers and adjust it if so. + * See if range @r contains large outliers. Adjust it if so. * - * The most left bound is fixed to 1ms. The most right bound is only growing - * to handle large values. So adjustment can increase gaps between ranges - * moving left range bounds only with reducing a range order or decrease the - * gaps moving right range bounds with enlarging a range order. I.e. ranges - * worms right and the algorithm converges at the largest faced response time. + * The leftmost bound is fixed to 1ms. 
The rightmost bound is only growing + * to handle large values. So the adjustment may either increase the gaps + * between ranges by decreasing a range order and moving left range bounds, + * or decrease the gaps by increasing a range order and moving right range + * bounds. I.e. ranges worm to the right and the algorithm converges at the + * largest response time faced. */ static void -tfw_stat_adjust(TfwPcntRanges *rng, int r) +tfw_stats_adjust(TfwPcntRanges *rng, int r) { - TfwPcntCtl pc; - static spinlock_t sa_guard = __RAW_SPIN_LOCK_UNLOCKED(sa_guard); - unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0; + int i; + TfwPcntCtl *pc, *prepc; + unsigned long prend, cnt = 0, sum = 0, max = 0, i_max = 0; - if (!spin_trylock(&sa_guard)) - return; /* somebody is already adjusting statistic ranges */ + BUG_ON(r == 0); - for (i = 0; i < TFW_STAT_BCKTS; ++i) { - if (atomic_read(&rng->cnt[r][i])) { - sum += atomic_read(&rng->cnt[r][i]); + for (i = 0; i < TFW_STATS_BCKTS; ++i) { + if (rng->cnt[r][i]) { + sum += rng->cnt[r][i]; ++cnt; } - if (max < atomic_read(&rng->cnt[r][i])) { - max = atomic_read(&rng->cnt[r][i]); + if (max < rng->cnt[r][i]) { + max = rng->cnt[r][i]; i_max = i; } } + BUG_ON(!cnt); + + /* outlier means (max < avg * 2) */ if (likely(max <= sum * 2 / cnt)) - /* outlier means (max < avg * 2) */ - goto out; + return; printf(" -- range %d has outlier %lu (avg=%lu total=%lu) at" " bucket %lu\n", r, max, sum / cnt, sum, i_max); - if (r && i_max == 0) { - /* - * Too many hits at the gap between r'th and (r - 1)'th ranges. - * Move right bound of (r - 1)'th range to the right. - */ - TfwPcntCtl pc_curr = { .atomic = rng->ctl[r].atomic }; - pc.atomic = rng->ctl[r - 1].atomic; - if (pc.begin + ((TFW_STAT_BCKTS - 1) << (pc.order + 1)) - < pc_curr.begin) - { - __range_grow_right(rng, &pc, r - 1); - /* - * Evenly distibute 0'th among right half of (r - 1)'th - * range. This is rough approximation. - */ - cnt = max / (TFW_STAT_BCKTS / 2 + 1); - atomic_sub(cnt * (TFW_STAT_BCKTS / 2), - &rng->cnt[r][0]); - for (i = TFW_STAT_BCKTS / 2; i < TFW_STAT_BCKTS; ++i) - atomic_set(&rng->cnt[r - 1][i], cnt); + /* + * If too many hits fall in the gap between r'th and (r - 1)'th + * ranges, and (r - 1)'th range can grow, then grow that range + * and spread these hits evenly in the right half of (r - 1)'th + * range as a rough approximation. Afterwards, move on to reduce + * the range order. The first bucket gets a higher count. Since + * the left bound has been moved, the right bound of (r - 1)'th + * range will be moved next time. + */ + pc = &rng->ctl[r]; + prepc = &rng->ctl[r - 1]; + prend = prepc->begin + TFW_STATS_RSPAN_UL(prepc->order + 1); - } - /* - * Fall through to reduce the interval order: the first bucket - * gets ever higher counter, but since the left bound is moved, - * we'll move right bound of (r - 1)'th range next time. - */ + if ((i_max == 0) && (prend < pc->begin)) { + __range_grow_right(rng, prepc, r - 1); + + cnt = max / (TFW_STATS_BCKTS / 2 + 1); + rng->cnt[r][0] -= cnt * (TFW_STATS_BCKTS / 2); + for (i = TFW_STATS_BCKTS / 2; i < TFW_STATS_BCKTS; ++i) + rng->cnt[r - 1][i] = cnt; } /* - * Too large order - reduce it by moving left bound. - * If servers are too fast (all responses within 1ms), - * then there is nothing to do for us. + * The range order is too big. Reduce it by moving the left bound. + * If servers are too fast (all responses within 1ms), then there's + * nothing to do here. 
*/ - if (!r) - goto out; - pc.atomic = rng->ctl[r].atomic; - if (likely(pc.order)) - __range_shrink_left(rng, &pc, r); - -out: - spin_unlock(&sa_guard); + if (likely(pc->order)) + __range_shrink_left(rng, pc, r); } /** * Update server response time statistic. * @r_time is in milliseconds (1/HZ second), use jiffies to get it. - * - * Can be ran concurrently w/ tfw_stat_adjust(), so counter to update is - * decided by range control handlers read at the begin. During the function - * execution the control handlers may change, so we can update wrong - * bucket and/or range. That's acceptable by our model. We only care about - * correct array indexing. */ static void -tfw_stat_upd(TfwPcntRanges *rng, unsigned int r_time) +tfw_stats_upd(TfwPcntRanges *rng, unsigned int r_time) { - TfwPcntCtl pc3, pc2 = { .atomic = rng->ctl[2].atomic }; - - atomic64_inc(&rng->tot_cnt); + TfwPcntCtl *pc3, *pc2 = &rng->ctl[2]; /* Binary search of appropriate range. */ - if (r_time <= pc2.end) { - TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; - if (pc1.end < r_time) { - atomic_inc(__rng(&pc2, rng->cnt[2], r_time)); - tfw_stat_adjust(rng, 2); - return; + if (r_time <= pc2->end) { + TfwPcntCtl *pc0, *pc1 = &rng->ctl[1]; + + if (r_time > pc1->end) { + ++(*__rng(pc2, rng->cnt[2], r_time)); + tfw_stats_adjust(rng, 2); + goto totals; } - pc0.atomic = rng->ctl[0].atomic; - BUG_ON(pc0.begin != 1); /* left bound is never moved */ - if (pc0.end < r_time) { - atomic_inc(__rng(&pc1, rng->cnt[1], r_time)); - tfw_stat_adjust(rng, 1); - return; + pc0 = &rng->ctl[0]; + BUG_ON(pc0->begin != 1); /* left bound is never moved */ + if (r_time > pc0->end) { + ++(*__rng(pc1, rng->cnt[1], r_time)); + tfw_stats_adjust(rng, 1); + goto totals; } - atomic_inc(__rng(&pc0, rng->cnt[0], r_time)); - tfw_stat_adjust(rng, 0); - return; - } - pc3.atomic = rng->ctl[3].atomic; - if (unlikely(r_time > pc3.end)) { - tfw_stat_extend(rng, r_time); - pc3.atomic = rng->ctl[3].atomic; + ++(*__rng(pc0, rng->cnt[0], r_time)); + goto totals; } - atomic_inc(__rng(&pc3, rng->cnt[3], r_time)); - tfw_stat_adjust(rng, 3); + + pc3 = &rng->ctl[3]; + if (unlikely(r_time > pc3->end)) + tfw_stats_extend(rng, r_time); + ++(*__rng(pc3, rng->cnt[3], r_time)); + tfw_stats_adjust(rng, 3); + +totals: + ++rng->tot_cnt; + + return; } /** @@ -362,10 +367,10 @@ tfw_stat_upd(TfwPcntRanges *rng, unsigned int r_time) * @pcnts must be sorted. */ static void -tfw_stat_calc(TfwPcntRanges *rng, Percentilie *pcnts, size_t np, bool clear) +tfw_stats_calc(TfwPcntRanges *rng, Percentilie *pcnts, size_t np, bool clear) { int i, r, b, p = 0; - unsigned long cnt, tot_cnt = atomic64_read(&rng->tot_cnt); + unsigned long cnt, tot_cnt = rng->tot_cnt; unsigned long pval[np]; if (unlikely(!tot_cnt)) @@ -378,21 +383,21 @@ tfw_stat_calc(TfwPcntRanges *rng, Percentilie *pcnts, size_t np, bool clear) pcnts[p++].val = 0; } - for (cnt = 0, r = 0; r < TFW_STAT_RANGES; ++r) - for (b = 0; b < TFW_STAT_BCKTS; ++b) { - cnt += atomic_read(&rng->cnt[r][b]); + for (cnt = 0, r = 0; r < TFW_STATS_RANGES; ++r) + for (b = 0; b < TFW_STATS_BCKTS; ++b) { + cnt += rng->cnt[r][b]; for ( ; p < np && pval[p] <= cnt; ++p) { pcnts[p].ith = cnt * 100 / tot_cnt; pcnts[p].val = rng->ctl[r].begin + (b << rng->ctl[r].order); } if (clear) - atomic_set(&rng->cnt[r][b], 0); + rng->cnt[r][b] = 0; } BUG_ON (p < np); if (clear) - atomic64_set(&rng->tot_cnt, 0); + rng->tot_cnt = 0; } /* @@ -400,10 +405,10 @@ tfw_stat_calc(TfwPcntRanges *rng, Percentilie *pcnts, size_t np, bool clear) * including crossatalantic. 
*/ static TfwPcntRanges rng = { - .ctl = { {{0, 1, 16}}, - {{1, 17, 47}}, - {{2, 48, 108}}, - {{4, 109, 349}} + .ctl = { {0, 1, 16}, + {1, 17, 47}, + {2, 48, 108}, + {4, 109, 349} } }; @@ -415,14 +420,14 @@ tfw_percentilie(const unsigned int *set, size_t len, Percentilie *pcnts, /* 1. Emulate getting @set in stream manner. */ for (i = 0; i < len; ++i) - tfw_stat_upd(&rng, set[i]); + tfw_stats_upd(&rng, set[i]); /* * 2. Perform percentilies calculation. * Zero the statistic on each call. In real life this should be done * once per T, configurable time. */ - tfw_stat_calc(&rng, pcnts, np, true); + tfw_stats_calc(&rng, pcnts, np, true); } int @@ -439,7 +444,7 @@ main(int argc, char *argv[]) /* Store previous statistic for Tempesta trends. * This should be used for /proc/tempesta/perfstat since - * tfw_stat_calc() should be called on timer. + * tfw_stats_calc() should be called on timer. * BTW (for further extensions) it's also good to send probe * request to all the servers on the timer to estimate their * availability.