log: add an interest cache for logs emitted through the `log` crate (tokio-rs#1636)

## Motivation

We use `tracing` as our logger in [`substrate`](https://github.com/paritytech/substrate). We've noticed that as soon as *any* `trace` log is enabled (even one which doesn't exist) the whole logging machinery starts to take a lot of time, even if nothing at all is actually printed!

In one of our quick-and-dirty reproduction benchmarks (JIT-ing a WASM program) we saw the total real runtime rise from around ~1.3s to ~7s just by adding a `trace` log filter which doesn't match anything. (Depending on the hardware and on how many threads are logging simultaneously this figure varies pretty widely, but it's always a very significant slowdown.)

After looking into this problem I found that the culprit of the slowdown was the `trace!` and `debug!` logs sprinkled quite liberally through some of the hotter codepaths. When there are no `trace`-level filters defined on the logger, it can reject those inert `trace!` and `debug!` logs purely based on the current maximum logging level (which is cheap!), but as soon as you define *any* trace filter the current maximum logging level changes, and then every `trace!` and `debug!` log has to go through the whole filtering machinery before it can be rejected. While this is cheap if you only do it once, it becomes very expensive when you do it a lot, especially when you're running multiple threads and enable log reloading. (This is related to tokio-rs#1632.)

## Solution

I've added an opt-in per-thread LRU cache which caches whether the logger is actually interested in a given `target` + `level` pair for every log emitted through the `log` crate.
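The idea can be sketched roughly as follows. This is not the actual implementation, just a minimal illustration of a per-thread interest cache keyed on `(target, level)`: the names (`is_interested`, `expensive_interest_check`, the capacity constant) and the eviction strategy are all simplified stand-ins, and the real code avoids the per-lookup `String` allocation used here for brevity.

```rust
use std::cell::RefCell;
use std::collections::HashMap;

// Hypothetical stand-in for the real filtering machinery, which is
// what's expensive to consult on every single `trace!`/`debug!` call.
fn expensive_interest_check(target: &str, level: u8) -> bool {
    target.starts_with("myapp") && level <= 3
}

const CAPACITY: usize = 1024;

thread_local! {
    // Per-thread cache: (target, level) -> (interested?, last-used tick).
    // Being thread-local means no cross-thread synchronization on lookups.
    static CACHE: RefCell<(u64, HashMap<(String, u8), (bool, u64)>)> =
        RefCell::new((0, HashMap::new()));
}

fn is_interested(target: &str, level: u8) -> bool {
    CACHE.with(|c| {
        let mut guard = c.borrow_mut();
        let (tick, map) = &mut *guard;
        *tick += 1;
        let now = *tick;

        // Fast path: answer already cached for this (target, level) pair.
        if let Some((hit, used)) = map.get_mut(&(target.to_string(), level)) {
            *used = now;
            return *hit;
        }

        // Slow path: evict the least-recently-used entry if full,
        // then compute and cache the answer.
        if map.len() >= CAPACITY {
            if let Some(oldest) = map
                .iter()
                .min_by_key(|(_, (_, used))| *used)
                .map(|(k, _)| k.clone())
            {
                map.remove(&oldest);
            }
        }
        let hit = expensive_interest_check(target, level);
        map.insert((target.to_string(), level), (hit, now));
        hit
    })
}

fn main() {
    assert!(is_interested("myapp::jit", 3)); // miss: computed, then cached
    assert!(is_interested("myapp::jit", 3)); // hit: served from the cache
    assert!(!is_interested("other::crate", 5));
    println!("ok");
}
```

The point is that after the first call for a given `(target, level)` pair, subsequent calls from the same thread only pay for a hash-map lookup instead of the full filter evaluation.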
I've also added a benchmark very roughly replicating the situation from our code. Here's the performance *without* the cache (`cargo bench`):

```
[838.67 ns 846.51 ns 854.04 ns]
```

And here's the performance *with* the cache (`cargo bench --features interest-cache`):

```
[25.322 ns 25.556 ns 25.820 ns]
```

As you can see, the per-call cost was cut down to roughly 3% of the original.