From d6a6eb848214135095e9034c24de748c4dfcbe53 Mon Sep 17 00:00:00 2001 From: Jonas Heinrich Date: Tue, 16 Apr 2024 11:50:57 +0200 Subject: [PATCH] Modify benchmarks to compare against stdlib functions This commit refactors and expands the microbenchmarks in order to evaluate the performance hit of handling full Unicode. It is expected that `unicode-segmentation`'s functions are slower since they consider graphemes; the question is just how much. - bump criterion dependency - rename benchmarks to remove unicode/grapheme relationship - move benchmarks into benchmark group - add scalar versions with stdlib "equivalents" (scalars) --- Cargo.toml | 7 ++-- benches/chars.rs | 60 +++++++++++++++++++++++++++ benches/graphemes.rs | 63 ---------------------------- benches/unicode_words.rs | 61 ---------------------------- benches/word_bounds.rs | 88 +++++++++++++++------------------------- benches/words.rs | 59 +++++++++++++++++++++++++++ 6 files changed, 154 insertions(+), 184 deletions(-) create mode 100644 benches/chars.rs delete mode 100644 benches/graphemes.rs delete mode 100644 benches/unicode_words.rs create mode 100644 benches/words.rs diff --git a/Cargo.toml b/Cargo.toml index dda0abf..a8d25db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" -criterion = "0.3" +criterion = "0.5" [[bench]] -name = "graphemes" +name = "chars" harness = false [[bench]] -name = "unicode_words" +name = "words" harness = false [[bench]] name = "word_bounds" harness = false - diff --git a/benches/chars.rs b/benches/chars.rs new file mode 100644 index 0000000..d8dc5ea --- /dev/null +++ b/benches/chars.rs @@ -0,0 +1,60 @@ +//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based +//! `std::str::chars`. +//! +//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it +//! 
does not consider the complexity of grapheme clusters. The question in this benchmark +//! is how much slower full unicode handling is. + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use unicode_segmentation; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for c in UnicodeSegmentation::graphemes(black_box(&*text), true) { + black_box(c); + } +} + +#[inline(always)] +fn scalar(text: &str) { + for c in black_box(&*text).chars() { + black_box(c); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("chars"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("scalar", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| scalar(content)), + ); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); diff --git a/benches/graphemes.rs b/benches/graphemes.rs deleted file mode 100644 index 3a0b9b7..0000000 --- a/benches/graphemes.rs +++ /dev/null @@ -1,63 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use unicode_segmentation; - -use std::fs; -use unicode_segmentation::UnicodeSegmentation; - -fn graphemes(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - - c.bench_function(&format!("graphemes_{}", lang), |bench| { - bench.iter(|| { - for g in UnicodeSegmentation::graphemes(black_box(&*text), true) { - black_box(g); - } - }) - }); -} - -fn graphemes_arabic(c: &mut Criterion) { - graphemes(c, "arabic", "benches/texts/arabic.txt"); -} 
- -fn graphemes_english(c: &mut Criterion) { - graphemes(c, "english", "benches/texts/english.txt"); -} - -fn graphemes_hindi(c: &mut Criterion) { - graphemes(c, "hindi", "benches/texts/hindi.txt"); -} - -fn graphemes_japanese(c: &mut Criterion) { - graphemes(c, "japanese", "benches/texts/japanese.txt"); -} - -fn graphemes_korean(c: &mut Criterion) { - graphemes(c, "korean", "benches/texts/korean.txt"); -} - -fn graphemes_mandarin(c: &mut Criterion) { - graphemes(c, "mandarin", "benches/texts/mandarin.txt"); -} - -fn graphemes_russian(c: &mut Criterion) { - graphemes(c, "russian", "benches/texts/russian.txt"); -} - -fn graphemes_source_code(c: &mut Criterion) { - graphemes(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - graphemes_arabic, - graphemes_english, - graphemes_hindi, - graphemes_japanese, - graphemes_korean, - graphemes_mandarin, - graphemes_russian, - graphemes_source_code, -); - -criterion_main!(benches); diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs deleted file mode 100644 index a7f8f41..0000000 --- a/benches/unicode_words.rs +++ /dev/null @@ -1,61 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use std::fs; -use unicode_segmentation::UnicodeSegmentation; - -fn unicode_words(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - c.bench_function(&format!("unicode_words_{}", lang), |bench| { - bench.iter(|| { - for w in text.unicode_words() { - black_box(w); - } - }) - }); -} - -fn unicode_words_arabic(c: &mut Criterion) { - unicode_words(c, "arabic", "benches/texts/arabic.txt"); -} - -fn unicode_words_english(c: &mut Criterion) { - unicode_words(c, "english", "benches/texts/english.txt"); -} - -fn unicode_words_hindi(c: &mut Criterion) { - unicode_words(c, "hindi", "benches/texts/hindi.txt"); -} - -fn unicode_words_japanese(c: &mut Criterion) { - unicode_words(c, "japanese", "benches/texts/japanese.txt"); -} - -fn 
unicode_words_korean(c: &mut Criterion) { - unicode_words(c, "korean", "benches/texts/korean.txt"); -} - -fn unicode_words_mandarin(c: &mut Criterion) { - unicode_words(c, "mandarin", "benches/texts/mandarin.txt"); -} - -fn unicode_words_russian(c: &mut Criterion) { - unicode_words(c, "russian", "benches/texts/russian.txt"); -} - -fn unicode_words_source_code(c: &mut Criterion) { - unicode_words(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - unicode_words_arabic, - unicode_words_english, - unicode_words_hindi, - unicode_words_japanese, - unicode_words_korean, - unicode_words_mandarin, - unicode_words_russian, - unicode_words_source_code, -); - -criterion_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs index cae7a88..42d50ff 100644 --- a/benches/word_bounds.rs +++ b/benches/word_bounds.rs @@ -1,61 +1,37 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; use std::fs; use unicode_segmentation::UnicodeSegmentation; -fn word_bounds(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - c.bench_function(&format!("word_bounds_{}", lang), |bench| { - bench.iter(|| { - for w in text.split_word_bounds() { - black_box(w); - } - }); - }); -} - -fn word_bounds_arabic(c: &mut Criterion) { - word_bounds(c, "arabic", "benches/texts/arabic.txt"); -} - -fn word_bounds_english(c: &mut Criterion) { - word_bounds(c, "english", "benches/texts/english.txt"); -} - -fn word_bounds_hindi(c: &mut Criterion) { - word_bounds(c, "hindi", "benches/texts/hindi.txt"); -} - -fn word_bounds_japanese(c: &mut Criterion) { - word_bounds(c, "japanese", "benches/texts/japanese.txt"); -} - -fn word_bounds_korean(c: &mut Criterion) { - word_bounds(c, "korean", "benches/texts/korean.txt"); -} - -fn word_bounds_mandarin(c: &mut Criterion) { - word_bounds(c, "mandarin", 
"benches/texts/mandarin.txt"); -} - -fn word_bounds_russian(c: &mut Criterion) { - word_bounds(c, "russian", "benches/texts/russian.txt"); -} - -fn word_bounds_source_code(c: &mut Criterion) { - word_bounds(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - word_bounds_arabic, - word_bounds_english, - word_bounds_hindi, - word_bounds_japanese, - word_bounds_korean, - word_bounds_mandarin, - word_bounds_russian, - word_bounds_source_code, -); - +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.split_word_bounds() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("word_bounds"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } +} + +criterion_group!(benches, bench_all); criterion_main!(benches); diff --git a/benches/words.rs b/benches/words.rs new file mode 100644 index 0000000..86785d5 --- /dev/null +++ b/benches/words.rs @@ -0,0 +1,59 @@ +//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8 +//! scalar-based `std::str::split_whitespace`. +//! +//! It is expected that `std::str::split_whitespace` is faster than +//! `UnicodeSegmentation::unicode_words` since it does not consider the complexity of grapheme +//! clusters. The question in this benchmark is how much slower full unicode handling is. 
+ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.unicode_words() { + black_box(w); + } +} + +#[inline(always)] +fn scalar(text: &str) { + for w in text.split_whitespace() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("words"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("scalar", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| scalar(content)), + ); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches);