From dc243befa4c6e3c15d3534fe8c4c348977f24932 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Mar 2024 03:01:47 +0000 Subject: [PATCH] Improve: Read benchmark duration from CLI --- scripts/bench.hpp | 35 ++++++++++++++++++++--------------- scripts/bench_container.cpp | 2 +- scripts/bench_search.cpp | 8 ++++++-- scripts/bench_similarity.cpp | 2 +- scripts/bench_sort.cpp | 2 +- scripts/bench_token.cpp | 2 +- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/scripts/bench.hpp b/scripts/bench.hpp index b685474a..03bc4d1e 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -19,12 +19,6 @@ #include "test.hpp" // `read_file` -#if SZ_DEBUG // Make debugging faster -#define default_seconds_m 10 -#else -#define default_seconds_m 30 -#endif - namespace sz = ashvardanian::stringzilla; namespace ashvardanian { @@ -162,6 +156,8 @@ inline std::vector filter_by_length(std::vector tokens; @@ -206,9 +202,20 @@ inline dataset_t make_dataset_from_path(std::string path) { /** * @brief Loads a dataset, depending on the passed CLI arguments. */ -inline dataset_t make_dataset(int argc, char const *argv[]) { - if (argc != 2) { throw std::runtime_error("Usage: " + std::string(argv[0]) + " "); } - return make_dataset_from_path(argv[1]); +inline dataset_t prepare_benchmark_environment(int argc, char const *argv[]) { + if (argc < 2 || argc > 3) + throw std::runtime_error("Usage: " + std::string(argv[0]) + " [seconds_per_benchmark]"); + + dataset_t data = make_dataset_from_path(argv[1]); + + // If the seconds_per_benchmark argument is provided, update the value in the dataset + if (argc == 3) { + seconds_per_benchmark = std::stoi(argv[2]); + if (seconds_per_benchmark == 0) + throw std::invalid_argument("The number of seconds per task must be greater than 0."); + } + + return data; } inline sz_string_view_t to_c(std::string_view str) noexcept { return {str.data(), str.size()}; } @@ -224,8 +231,7 @@ inline sz_string_view_t to_c(sz_string_view_t str) noexcept { return str; } * @return Number of seconds per iteration. */ template -benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&function, - seconds_t max_time = default_seconds_m) { +benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&function) { namespace stdc = std::chrono; using stdcc = stdc::high_resolution_clock; @@ -245,7 +251,7 @@ benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&funct stdcc::time_point t2 = stdcc::now(); result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; - if (result.seconds > max_time) break; + if (result.seconds > seconds_per_benchmark) break; } return result; @@ -259,8 +265,7 @@ benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&funct * @return Number of seconds per iteration. */ template -benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type &&function, - seconds_t max_time = default_seconds_m) { +benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type &&function) { namespace stdc = std::chrono; using stdcc = stdc::high_resolution_clock; @@ -282,7 +287,7 @@ benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type && stdcc::time_point t2 = stdcc::now(); result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; - if (result.seconds > max_time) break; + if (result.seconds > seconds_per_benchmark) break; } return result; diff --git a/scripts/bench_container.cpp b/scripts/bench_container.cpp index 6a77ba45..17cd1ec6 100644 --- a/scripts/bench_container.cpp +++ b/scripts/bench_container.cpp @@ -72,7 +72,7 @@ void bench_tokens(strings_type const &strings) { int main(int argc, char const **argv) { std::printf("StringZilla. Starting search benchmarks.\n"); - dataset_t dataset = make_dataset(argc, argv); + dataset_t dataset = prepare_benchmark_environment(argc, argv); // Baseline benchmarks for real words, coming in all lengths std::printf("Benchmarking on real words:\n"); diff --git a/scripts/bench_search.cpp b/scripts/bench_search.cpp index c8175f58..ada4ded4 100644 --- a/scripts/bench_search.cpp +++ b/scripts/bench_search.cpp @@ -292,19 +292,23 @@ void bench_search(std::string const &haystack, std::vector const &s int main(int argc, char const **argv) { std::printf("StringZilla. Starting search benchmarks.\n"); - dataset_t dataset = make_dataset(argc, argv); + dataset_t dataset = prepare_benchmark_environment(argc, argv); // Splitting by new lines std::printf("Benchmarking for a newline symbol:\n"); bench_finds(dataset.text, {"\n"}, find_functions()); bench_rfinds(dataset.text, {"\n"}, rfind_functions()); + std::printf("Benchmarking for one whitespace:\n"); + bench_finds(dataset.text, {" "}, find_functions()); + bench_rfinds(dataset.text, {" "}, rfind_functions()); + std::printf("Benchmarking for an [\\n\\r\\v\\f] RegEx:\n"); bench_finds(dataset.text, {"\n\r\v\f"}, find_charset_functions()); bench_rfinds(dataset.text, {"\n\r\v\f"}, rfind_charset_functions()); // Typical ASCII tokenization and validation benchmarks - std::printf("Benchmarking for whitespaces:\n"); + std::printf("Benchmarking for all whitespaces:\n"); bench_finds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, find_charset_functions()); bench_rfinds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, rfind_charset_functions()); diff --git a/scripts/bench_similarity.cpp b/scripts/bench_similarity.cpp index e1112a69..b2c36a60 100644 --- a/scripts/bench_similarity.cpp +++ b/scripts/bench_similarity.cpp @@ -107,7 +107,7 @@ void bench_similarity_on_bio_data() { void bench_similarity_on_input_data(int argc, char const **argv) { - dataset_t dataset = make_dataset(argc, argv); + dataset_t dataset = prepare_benchmark_environment(argc, argv); // Baseline benchmarks for real words, coming in all lengths std::printf("Benchmarking on real words:\n"); diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index bc4af07a..683164ab 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -142,7 +142,7 @@ void bench_permute(char const *name, strings_t &strings, permute_t &permute, alg int main(int argc, char const **argv) { std::printf("StringZilla. Starting sorting benchmarks.\n"); - dataset_t dataset = make_dataset(argc, argv); + dataset_t dataset = prepare_benchmark_environment(argc, argv); strings_t strings {dataset.tokens.begin(), dataset.tokens.end()}; permute_t permute_base, permute_new; diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 4eb1db04..0a57ff69 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -161,7 +161,7 @@ void bench(strings_type &&strings) { } void bench_on_input_data(int argc, char const **argv) { - dataset_t dataset = make_dataset(argc, argv); + dataset_t dataset = prepare_benchmark_environment(argc, argv); std::printf("Benchmarking on the entire dataset:\n"); bench_unary_functions(dataset.tokens, random_generation_functions(100));