Skip to content

Commit

Permalink
Handle statistics on selectivity estimation errors.
Browse files Browse the repository at this point in the history
Per idea from Oleg Bartunov.

We now compute the selectivity estimation error, both as a ratio and as a raw
number of rows, per qual, and keep track of the minimum, maximum, mean and
standard deviation for both values.

Also add two new GUCs, to store only the quals that have at least a certain
selectivity estimation error ratio or number of rows.
  • Loading branch information
rjuju committed Dec 29, 2019
1 parent 8d41caf commit 5ef44ed
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 29 deletions.
68 changes: 50 additions & 18 deletions pg_qualstats--2.0.0dev.sql
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,22 @@ LANGUAGE C;
the total number of execution of this predicate.
nbfiltered (bigint):
the number of lines filtered by this predicate
min_err_estimate_ratio(double precision):
the minimum selectivity estimation error ratio for this predicate
max_err_estimate_ratio(double precision):
the maximum selectivity estimation error ratio for this predicate
mean_err_estimate_ratio(double precision):
the mean selectivity estimation error ratio for this predicate
stddev_err_estimate_ratio(double precision):
the standard deviation for selectivity estimation error ratio for this predicate
min_err_estimate_num(bigint):
the minimum number of lines for selectivity estimation error for this predicate
max_err_estimate_num(bigint):
the maximum number of lines for selectivity estimation error for this predicate
mean_err_estimate_num(double precision):
the mean number of lines for selectivity estimation error for this predicate
stddev_err_estimate_num(double precision):
the standard deviation for number of lines for selectivity estimation error for this predicate
constant_position (int):
the position of the constant in the original query, as filled by the lexer.
queryid (bigint):
Expand All @@ -81,24 +97,32 @@ LANGUAGE C;
powa=# select * from powa_statements where queryid != 2;
powa=# select * from pg_qualstats();
-[ RECORD 1 ]-----+-----------
userid | 16384
dbid | 850774
lrelid | 851367
lattnum | 1
opno | 417
rrelid |
rattnum |
qualid |
uniquequalid |
qualnodeid | 1711571257
uniquequalnodeid | 466568149
occurences | 1
execution_count | 1206
nbfiltered | 0
constant_position | 47
queryid | 3644521490
constvalue | 2::integer
eval_type | f
userid | 10
dbid | 32799
lrelid | 189341
lattnum | 2
opno | 417
rrelid |
rattnum |
qualid |
uniquequalid |
qualnodeid | 1391544855
uniquequalnodeid | 551979005
occurences | 1
execution_count | 31
nbfiltered | 0
min_err_estimate_ratio | 32.741935483871
max_err_estimate_ratio | 32.741935483871
mean_err_estimate_ratio | 32.741935483871
stddev_err_estimate_ratio | 0
min_err_estimate_num | 984
max_err_estimate_num | 984
mean_err_estimate_num | 984
stddev_err_estimate_num | 0
constant_position | 47
queryid | -6668685762776610659
constvalue | 2::integer
eval_type | f
*/
CREATE FUNCTION pg_qualstats(
OUT userid oid,
Expand All @@ -115,6 +139,14 @@ CREATE FUNCTION pg_qualstats(
OUT occurences bigint,
OUT execution_count bigint,
OUT nbfiltered bigint,
OUT min_err_estimate_ratio double precision,
OUT max_err_estimate_ratio double precision,
OUT mean_err_estimate_ratio double precision,
OUT stddev_err_estimate_ratio double precision,
OUT min_err_estimate_num bigint,
OUT max_err_estimate_num bigint,
OUT mean_err_estimate_num double precision,
OUT stddev_err_estimate_num double precision,
OUT constant_position int,
OUT queryid bigint,
OUT constvalue varchar,
Expand Down
153 changes: 142 additions & 11 deletions pg_qualstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*-------------------------------------------------------------------------
*/
#include <limits.h>
#include <math.h>
#include "postgres.h"
#include "access/hash.h"
#include "access/htup_details.h"
Expand Down Expand Up @@ -68,7 +69,7 @@

PG_MODULE_MAGIC;

#define PGQS_COLUMNS 18 /* number of columns in pg_qualstats SRF */
#define PGQS_COLUMNS 26 /* number of columns in pg_qualstats SRF */
#define PGQS_NAME_COLUMNS 7 /* number of column added when using
* pg_qualstats_column SRF */
#define PGQS_USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
Expand All @@ -79,6 +80,9 @@ PG_MODULE_MAGIC;

#define PGQS_FLAGS (INSTRUMENT_ROWS|INSTRUMENT_BUFFERS)

#define PGQS_RATIO 0
#define PGQS_NUM 1

/*---- Function declarations ----*/

void _PG_init(void);
Expand Down Expand Up @@ -132,6 +136,8 @@ static bool pgqs_resolve_oids; /* resolve oids */
static bool pgqs_enabled;
static bool pgqs_track_constants;
static double pgqs_sample_rate;
static int pgqs_min_err_ratio;
static int pgqs_min_err_num;
static int query_is_sampled; /* Is the current query sampled, per backend */
static int nesting_level = 0; /* Current nesting depth of ExecutorRun calls */
static bool pgqs_assign_sample_rate_check_hook(double *newval, void **extra, GucSource source);
Expand Down Expand Up @@ -212,7 +218,12 @@ typedef struct pgqsEntry
int64 nbfiltered; /* # of lines discarded by the operator */
int position; /* content position in query text */
double usage; /* # of qual execution, used for deallocation */
int64 occurences;
double min_err_estim[2]; /* min estimation error ratio and num */
double max_err_estim[2]; /* max estimation error ratio and num */
double mean_err_estim[2]; /* mean estimation error ratio and num */
double sum_err_estim[2]; /* sum of variances in estimation error
* ratio and num */
int64 occurences; /* # of qual execution, 1 per query */
} pgqsEntry;

typedef struct pgqsEntryWithNames
Expand Down Expand Up @@ -255,6 +266,7 @@ typedef struct pgqsWalkerContext
uint32 uniquequalid; /* Hash of the parent, including the consts */
int64 count;
int64 nbfiltered;
double err_estim[2];
int nentries; /* number of entries found so far */
char evaltype;
const char *querytext;
Expand All @@ -277,6 +289,7 @@ static Expr *pgqs_resolve_var(Var *var, pgqsWalkerContext *context);
static void pgqs_entry_dealloc(void);
static inline void pgqs_entry_init(pgqsEntry *entry);
static inline void pgqs_entry_copy_raw(pgqsEntry *dest, pgqsEntry *src);
static void pgqs_entry_err_estim(pgqsEntry *e, double *err_estim, int64 occurences);
static void pgqs_queryentry_dealloc(void);
static void pgqs_localentry_dealloc(int nvictims);
static void pgqs_fillnames(pgqsEntryWithNames *entry);
Expand Down Expand Up @@ -387,6 +400,32 @@ _PG_init(void)
NULL,
NULL);

DefineCustomIntVariable("pg_qualstats.min_err_estimate_ratio",
"Error estimation ratio threshold to save quals",
NULL,
&pgqs_min_err_ratio,
0,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);

DefineCustomIntVariable("pg_qualstats.min_err_estimate_num",
"Error estimation num threshold to save quals",
NULL,
&pgqs_min_err_num,
0,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);

EmitWarningsOnPlaceholders("pg_qualstats");

parse_int(GetConfigOption("track_activity_query_size", false, false),
Expand Down Expand Up @@ -741,7 +780,9 @@ pgqs_ExecutorEnd(QueryDesc *queryDesc)
newEntry->count += localentry->count;
newEntry->nbfiltered += localentry->nbfiltered;
newEntry->usage += localentry->usage;
newEntry->occurences += localentry->occurences;
/* compute estimation error min, max, mean and variance */
pgqs_entry_err_estim(newEntry, localentry->mean_err_estim,
localentry->occurences);
}
/* cleanup local hash */
hash_search(pgqs_localhash, &localentry->key, HASH_REMOVE, NULL);
Expand Down Expand Up @@ -837,6 +878,42 @@ pgqs_entry_copy_raw(pgqsEntry *dest, pgqsEntry *src)
(sizeof(pgqsEntry) - sizeof(pgqsHashKey)));
}

/*
 * Fold an estimation error observation into the running statistics of an
 * entry, and maintain its min and max values.
 *
 * e:          entry to update.  Its occurences counter is increased by
 *             "occurences".
 * err_estim:  two-element array, index 0 being the estimation error ratio and
 *             index 1 the estimation error in number of rows.
 * occurences: number of observations that err_estim stands for.  When it is
 *             greater than 1 (a caller passing an already aggregated mean),
 *             err_estim is treated as the mean of that many observations.
 *
 * The mean and the sum of squared differences from the mean are maintained
 * incrementally using Welford's method, see
 * <http://www.johndcook.com/blog/standard_deviation/>, with the update weighted
 * by "occurences" so that a batch counts for as many observations as it adds
 * to the counter.  With occurences == 1 this is exactly Welford's update.
 *
 * Only the batch mean is available through this interface, so the dispersion
 * and the min/max inside a batch cannot be accounted for here.
 */
static void
pgqs_entry_err_estim(pgqsEntry *e, double err_estim[2], int64 occurences)
{
    /* number of observations accumulated before this call */
    int64       prev_occurences = e->occurences;

    e->occurences += occurences;

    for (int i = 0; i < 2; i++)
    {
        double      val = err_estim[i];
        double      delta;

        if (prev_occurences == 0)
        {
            /* first observation: it defines min, max and mean by itself */
            e->min_err_estim[i] = val;
            e->max_err_estim[i] = val;
            e->mean_err_estim[i] = val;
            /* no deviation yet; don't rely on earlier initialisation */
            e->sum_err_estim[i] = 0.0;
            continue;
        }

        /* distance to the mean as it was before this observation */
        delta = val - e->mean_err_estim[i];

        /* move the mean proportionally to the weight of the new batch */
        e->mean_err_estim[i] += delta * occurences / e->occurences;

        /* uses the old mean (delta) and the freshly updated mean */
        e->sum_err_estim[i] +=
            occurences * delta * (val - e->mean_err_estim[i]);

        /* calculate min/max counters */
        if (val < e->min_err_estim[i])
        {
            e->min_err_estim[i] = val;
        }
        if (val > e->max_err_estim[i])
        {
            e->max_err_estim[i] = val;
        }
    }
}

/*
* Deallocate the first example query.
* Caller must hold an exclusive lock on pgqs->querylock
Expand Down Expand Up @@ -908,6 +985,8 @@ pgqs_collectNodeStats(PlanState *planstate, List *ancestors, pgqsWalkerContext *
Instrumentation *instrument = planstate->instrument;
int64 oldcount = context->count;
double oldfiltered = context->nbfiltered;
double old_err_ratio = context->err_estim[PGQS_RATIO];
double old_err_num = context->err_estim[PGQS_NUM];
double total_filtered = 0;
ListCell *lc;
List *parent = 0;
Expand Down Expand Up @@ -976,18 +1055,47 @@ pgqs_collectNodeStats(PlanState *planstate, List *ancestors, pgqsWalkerContext *
context->nbfiltered = total_filtered;
context->count = instrument->tuplecount + instrument->ntuples + total_filtered;

/* Add the indexquals */
context->evaltype = 'i';
expression_tree_walker((Node *) indexquals, pgqs_whereclause_tree_walker, context);
if (plan->plan_rows == instrument->ntuples)
{
context->err_estim[PGQS_RATIO] = 0;
context->err_estim[PGQS_NUM] = 0;
}
else if (plan->plan_rows > instrument->ntuples)
{
/* XXX should we use a bigger value? */
if (instrument->ntuples == 0)
context->err_estim[PGQS_RATIO] = plan->plan_rows * 1.0L;
else
context->err_estim[PGQS_RATIO] = plan->plan_rows * 1.0L / instrument->ntuples;
context->err_estim[PGQS_NUM] = plan->plan_rows - instrument->ntuples;
}
else
{
/* plan_rows cannot be zero */
context->err_estim[PGQS_RATIO] = instrument->ntuples * 1.0L / plan->plan_rows;
context->err_estim[PGQS_NUM] = instrument->ntuples - plan->plan_rows;
}

/* Add the generic quals */
context->evaltype = 'f';
expression_tree_walker((Node *) quals, pgqs_whereclause_tree_walker, context);
if ( context->err_estim[PGQS_RATIO] >= pgqs_min_err_ratio &&
context->err_estim[PGQS_NUM] >= pgqs_min_err_num)
{
/* Add the indexquals */
context->evaltype = 'i';
expression_tree_walker((Node *) indexquals,
pgqs_whereclause_tree_walker, context);

/* Add the generic quals */
context->evaltype = 'f';
expression_tree_walker((Node *) quals, pgqs_whereclause_tree_walker,
context);
}

context->qualid = 0;
context->uniquequalid = 0;
context->count = oldcount;
context->nbfiltered = oldfiltered;
context->err_estim[PGQS_RATIO] = old_err_ratio;
context->err_estim[PGQS_NUM] = old_err_num;

foreach(lc, planstate->initPlan)
{
Expand Down Expand Up @@ -1211,7 +1319,8 @@ pgqs_process_booltest(BooleanTest *expr, pgqsWalkerContext *context)
entry->nbfiltered += context->nbfiltered;
entry->count += context->count;
entry->usage += 1;
entry->occurences += 1;
/* compute estimation error min, max, mean and variance */
pgqs_entry_err_estim(entry, context->err_estim, 1);

return entry;
}
Expand Down Expand Up @@ -1545,7 +1654,8 @@ pgqs_process_opexpr(OpExpr *expr, pgqsWalkerContext *context)
entry->nbfiltered += context->nbfiltered;
entry->count += context->count;
entry->usage += 1;
entry->occurences += 1;
/* compute estimation error min, max, mean and variance */
pgqs_entry_err_estim(entry, context->err_estim, 1);

return entry;
}
Expand Down Expand Up @@ -1773,6 +1883,7 @@ pg_qualstats_common(PG_FUNCTION_ARGS, bool include_names)
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
int i = 0;
double stddev_estim[2];

memset(values, 0, sizeof(Datum) * nb_columns);
memset(nulls, 0, sizeof(bool) * nb_columns);
Expand Down Expand Up @@ -1816,6 +1927,26 @@ pg_qualstats_common(PG_FUNCTION_ARGS, bool include_names)
values[i++] = Int64GetDatum(entry->count);
values[i++] = Int64GetDatum(entry->nbfiltered);

for (int j = 0; j < 2; j++)
{
if (j == PGQS_RATIO) /* min/max ratio are double precision */
{
values[i++] = Float8GetDatum(entry->min_err_estim[j]);
values[i++] = Float8GetDatum(entry->max_err_estim[j]);
}
else /* min/max num are bigint */
{
values[i++] = Int64GetDatum(entry->min_err_estim[j]);
values[i++] = Int64GetDatum(entry->max_err_estim[j]);
}
values[i++] = Float8GetDatum(entry->mean_err_estim[j]);
if (entry->occurences > 1)
stddev_estim[j] = sqrt(entry->sum_err_estim[j] / entry->occurences);
else
stddev_estim[j] = 0.0;
values[i++] = Float8GetDatumFast(stddev_estim[j]);
}

if (entry->position == -1)
nulls[i++] = true;
else
Expand Down

0 comments on commit 5ef44ed

Please sign in to comment.