Skip to content

Commit

Permalink
Expand crash testing of tiered storage, especially FIFO (facebook#13176)
Browse files Browse the repository at this point in the history
Summary:
* Test tiered storage FIFO setting `file_temperature_age_thresholds` in crash test, with dynamic mutability.
* Re-organize db_crashtest.py slightly to better handle tiered storage parameters and their interaction with compaction_style and num_levels. I have put most of this logic in the Python script so that `db_stress` command lines reflect the settings in effect as accurately as possible.
* Tweak crash test settings for preclude_last_level_data_seconds. This seems to have amplified the possibility of hitting "Corruption: Unsafe to store Seq later" even with universal compaction, for which I am working on a fix. Once that is fixed, we should also be able to enable tiered+leveled. (TODO / follow-up items)
* Code formatting / small simplifications in db_crashtest.py

Pull Request resolved: facebook#13176

Test Plan:
no production code changes

Kicked off about 24 CI jobs (temporary internal link https://fburl.com/sandcastle/s61rzusr)

Reviewed By: cbi42

Differential Revision: D66674123

Pulled By: pdillinger

fbshipit-source-id: 33dd7f9d291ec4a9516665b4adb998fd9a2b9266
  • Loading branch information
pdillinger authored and facebook-github-bot committed Dec 4, 2024
1 parent 80c3705 commit 3b91fe8
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 15 deletions.
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ DECLARE_bool(inplace_update_support);
DECLARE_uint32(uncache_aggressiveness);
DECLARE_int32(test_ingest_standalone_range_deletion_one_in);
DECLARE_bool(allow_unprepared_value);
// FIFO tiered storage setting; see the matching DEFINE_string in
// db_stress_gflags.cc ("empty == unset").
DECLARE_string(file_temperature_age_thresholds);

// One kibibyte, expressed in bytes.
constexpr long KB{1024};
// Upper bound factor applied when randomizing value sizes in the stress
// test (consumers live elsewhere in db_stress).
constexpr int kRandomValueMaxFactor{3};
Expand Down
4 changes: 4 additions & 0 deletions db_stress_tool/db_stress_gflags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,10 @@ DEFINE_uint32(use_timed_put_one_in, 0,
"If greater than zero, TimedPut is used per every N write ops on "
"on average.");

// Crash-test flag feeding CompactionOptionsFIFO::file_temperature_age_thresholds
// (FIFO tiered storage). An empty string leaves the DB option unset.
DEFINE_string(file_temperature_age_thresholds, "",
"See CompactionOptionsFIFO::file_temperature_age_thresholds. "
"empty == unset");

static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);

Expand Down
22 changes: 22 additions & 0 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,16 @@ bool StressTest::BuildOptionsTable() {
"2147483646", "2147483647"});
}

if (!FLAGS_file_temperature_age_thresholds.empty()) {
// Modify file_temperature_age_thresholds only if it is set initially
// (FIFO tiered storage setup)
options_tbl.emplace(
"file_temperature_age_thresholds",
std::vector<std::string>{
"{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
"{{temperature=kCold;age=100}}", "{}"});
}

options_table_ = std::move(options_tbl);

for (const auto& iter : options_table_) {
Expand Down Expand Up @@ -4156,6 +4166,18 @@ void InitializeOptionsFromFlags(
options.default_temperature =
StringToTemperature(FLAGS_default_temperature.c_str());

if (!FLAGS_file_temperature_age_thresholds.empty()) {
Status s = GetColumnFamilyOptionsFromString(
{}, options,
"compaction_options_fifo={file_temperature_age_thresholds=" +
FLAGS_file_temperature_age_thresholds + "}",
&options);
if (!s.ok()) {
fprintf(stderr, "While setting file_temperature_age_thresholds: %s\n",
s.ToString().c_str());
exit(1);
}
}
options.preclude_last_level_data_seconds =
FLAGS_preclude_last_level_data_seconds;
options.preserve_internal_time_seconds = FLAGS_preserve_internal_time_seconds;
Expand Down
59 changes: 44 additions & 15 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,16 +593,21 @@ def is_direct_io_supported(dbname):
}

# Extra stress-test parameters for tiered storage runs (presumably merged
# into the command params when --test_tiered_storage is set — confirm
# against gen_cmd_params). NOTE(review): the previous version of this dict
# repeated "preclude_last_level_data_seconds" and "last_level_temperature";
# duplicate keys in a dict literal are silently shadowed (later wins), so
# the dead earlier entries are removed here without changing the effective
# contents.
tiered_params = {
    # For Leveled/Universal compaction (ignored for FIFO).
    # Bias toward times that can elapse during a crash test run series.
    "preclude_last_level_data_seconds": lambda: random.choice([10, 60, 1200, 86400]),
    "last_level_temperature": "kCold",
    # Only test universal compaction for now; level has a known issue of
    # endless compaction.
    "compaction_style": 1,
    # For FIFO compaction (ignored otherwise).
    "file_temperature_age_thresholds": lambda: random.choice(
        [
            "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
            "{{temperature=kCold;age=100}}",
        ]
    ),
    # Tiered storage doesn't support blob DB yet.
    "enable_blob_files": 0,
    "use_blob_db": 0,
    "default_write_temperature": lambda: random.choice(["kUnknown", "kHot", "kWarm"]),
}

multiops_txn_default_params = {
Expand Down Expand Up @@ -800,6 +805,12 @@ def finalize_and_sanitize(src_params):
# now assertion failures are triggered.
dest_params["compaction_ttl"] = 0
dest_params["periodic_compaction_seconds"] = 0
# Disable irrelevant tiering options
dest_params["preclude_last_level_data_seconds"] = 0
dest_params["last_level_temperature"] = "kUnknown"
else:
# Disable irrelevant tiering options
dest_params["file_temperature_age_thresholds"] = ""
if dest_params["partition_filters"] == 1:
if dest_params["index_type"] != 2:
dest_params["partition_filters"] = 0
Expand Down Expand Up @@ -924,7 +935,10 @@ def finalize_and_sanitize(src_params):
dest_params["check_multiget_consistency"] = 0
dest_params["check_multiget_entity_consistency"] = 0
if dest_params.get("disable_wal") == 0:
if dest_params.get("reopen") > 0 or (dest_params.get("manual_wal_flush_one_in") and dest_params.get("column_families") != 1):
if dest_params.get("reopen") > 0 or (
dest_params.get("manual_wal_flush_one_in")
and dest_params.get("column_families") != 1
):
# Reopen with WAL currently requires persisting WAL data before closing for reopen.
# Previous injected WAL write errors may not be cleared by the time of closing and ready
# for persisting WAL.
Expand Down Expand Up @@ -969,7 +983,10 @@ def finalize_and_sanitize(src_params):
# can cause checkpoint verification to fail. So make the two mutually exclusive.
if dest_params.get("checkpoint_one_in") != 0:
dest_params["lock_wal_one_in"] = 0
if dest_params.get("ingest_external_file_one_in") == 0 or dest_params.get("delrangepercent") == 0:
if (
dest_params.get("ingest_external_file_one_in") == 0
or dest_params.get("delrangepercent") == 0
):
dest_params["test_ingest_standalone_range_deletion_one_in"] = 0
return dest_params

Expand Down Expand Up @@ -1017,6 +1034,14 @@ def gen_cmd_params(args):
):
params.update(blob_params)

if "compaction_style" not in params:
# Default to leveled compaction
# TODO: Fix "Unsafe to store Seq later" with tiered+leveled and
# enable that combination rather than falling back to universal.
# TODO: There is also an alleged bug with leveled compaction
# infinite looping but that likely would not fail the crash test.
params["compaction_style"] = 0 if not args.test_tiered_storage else 1

for k, v in vars(args).items():
if v is not None:
params[k] = v
Expand Down Expand Up @@ -1145,7 +1170,9 @@ def blackbox_crash_main(args, unknown_args):
cmd = gen_cmd(
dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
)
hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["verify_timeout"], True)
hit_timeout, retcode, outs, errs = execute_cmd(
cmd, cmd_params["verify_timeout"], True
)

# For the final run
print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
Expand Down Expand Up @@ -1230,12 +1257,8 @@ def whitebox_crash_main(args, unknown_args):
# Single level universal has a lot of special logic. Ensure we cover
# it sometimes.
if not args.test_tiered_storage and random.randint(0, 1) == 1:
additional_opts.update(
{
"num_levels": 1,
}
)
elif check_mode == 2 and not args.test_tiered_storage:
additional_opts["num_levels"] = 1
elif check_mode == 2:
# normal run with FIFO compaction mode
# ops_per_thread is divided by 5 because FIFO compaction
# style is quite a bit slower on reads with lot of files
Expand All @@ -1244,6 +1267,12 @@ def whitebox_crash_main(args, unknown_args):
"ops_per_thread": cmd_params["ops_per_thread"] // 5,
"compaction_style": 2,
}
# TODO: test transition from non-FIFO to FIFO with num_levels > 1.
# See https://github.com/facebook/rocksdb/pull/10348
# For now, tiered storage FIFO (file_temperature_age_thresholds)
# requires num_levels == 1 and non-tiered operates that way.
if args.test_tiered_storage:
additional_opts["num_levels"] = 1
else:
# normal run
additional_opts = {
Expand Down

0 comments on commit 3b91fe8

Please sign in to comment.