From 27224c2a59e4ebc9b43fe6370ce22ec17ad3f666 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Oct 2023 17:35:25 -0400 Subject: [PATCH 01/23] ZIL: Detect single-threaded workloads ... by checking that previous block is fully written and flushed. It allows to skip commit delays since we can give up on aggregation in that case. This removes zil_min_commit_timeout parameter, since for single-threaded workloads it is not needed at all, while on very fast devices even some multi-threaded workloads may get detected as single-threaded and still bypass the wait. To give multi-threaded workloads more aggregation chances increase zfs_commit_timeout_pct from 5 to 10%, as they should suffer less from additional latency. Also single-threaded workloads detection allows in perspective better prediction of the next block size. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15381 --- include/sys/zil_impl.h | 4 +- man/man4/zfs.4 | 9 +---- module/zfs/zil.c | 91 +++++++++++++++++++----------------------- 3 files changed, 44 insertions(+), 60 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index f780ad3d61bc..c9db6d428ea2 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -181,6 +181,7 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_BURSTS 8 #define ZIL_PREV_BLKS 16 /* @@ -222,8 +223,9 @@ struct zilog { clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index bab40dd91c72..5307f1f32e93 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -799,7 +799,7 @@ Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . -.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint +.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. @@ -2206,13 +2206,6 @@ This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . -.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 -This sets the minimum delay in nanoseconds ZIL care to delay block commit, -waiting for more records. -If ZIL writes are too fast, kernel may not be able sleep for so short interval, -increasing log latency above allowed by -.Sy zfs_commit_timeout_pct . -. .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 5642f082bdb8..8742fc6623a1 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -91,15 +91,7 @@ * committed to stable storage. 
Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -static uint_t zfs_commit_timeout_pct = 5; - -/* - * Minimal time we care to delay commit waiting for more ZIL records. - * At least FreeBSD kernel can't sleep for less than 2us at its best. - * So requests to sleep for less then 5us is a waste of CPU time with - * a risk of significant log latency increase due to oversleep. - */ -static uint64_t zil_min_commit_timeout = 5000; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. @@ -2732,6 +2724,19 @@ zil_commit_writer_stall(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); } +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_used == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + zilog->zl_cur_used = 0; +} + /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly @@ -2746,7 +2751,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; - boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2772,9 +2776,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); - first = (lwb->lwb_state == LWB_STATE_NEW) && - ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || - plwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. + * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2849,7 +2866,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * Our lwb is done, leave the rest of * itx list to somebody else who care. */ - first = B_FALSE; + zilog->zl_parallel = ZIL_BURSTS; break; } } else { @@ -2941,28 +2958,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. - * - * If we had no already running or open LWBs, it can be - * the workload is single-threaded. And if the ZIL write - * latency is very small or if the LWB is almost full, it - * may be cheaper to bypass the delay. 
*/ - if (lwb->lwb_state == LWB_STATE_OPENED && first) { - hrtime_t sleep = zilog->zl_last_lwb_latency * - zfs_commit_timeout_pct / 100; - if (sleep < zil_min_commit_timeout || - lwb->lwb_nmax - lwb->lwb_nused < - lwb->lwb_nmax / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, - LWB_STATE_NEW); - zilog->zl_cur_used = 0; - if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) - != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); - } + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + zil_burst_done(zilog); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); } } } @@ -3120,19 +3124,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; + zil_burst_done(zilog); if (nlwb == NULL) { /* @@ -4250,9 +4242,6 @@ EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); -ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, - "Minimum delay we care for ZIL block commit"); - ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); From 98c24e91f2bb339ba5a72e7ce64361febf1b855c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 17 Nov 2023 17:00:59 -0500 Subject: [PATCH 02/23] ZIO: Optimize zio_flush() - Generalize vdev_nowritecache handling by traversing through the VDEV tree and skipping children ZIOs where not supported. - Remove intermediate zio_null() in case of several VDEV children. - Remove children handling from zio_ioctl(). There are no other use cases for this code beside DKIOCFLUSHWRITECACHED, and would there be, I doubt they would so straightforward apply to all VDEV children. Comparing to removed previous optimization this should improve cases of redundant ZILs/SLOGs. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15515 --- module/zfs/zil.c | 2 +- module/zfs/zio.c | 36 +++++++++++++++--------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8742fc6623a1..7ad0fb344b7b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1622,7 +1622,7 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL && !vd->vdev_nowritecache) { + if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". 
This means, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d8eb075eef54..d0b4016237b9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1435,23 +1435,10 @@ zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - + zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + zio->io_cmd = cmd; return (zio); } @@ -1622,11 +1609,18 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, } void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + if (vd->vdev_nowritecache) + return; + if (vd->vdev_children == 0) { + zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd, + DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void From 6e7331d140f5c489e7ec3b4fd0582a4dad5a389e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Dec 2023 13:54:44 -0500 Subject: [PATCH 03/23] ZIL: Improve next log block size prediction Track history in context of bursts, not individual log blocks. It allows to not blow away all the history by single large burst of many block, and same time allows optimizations covering multiple blocks in a burst and even predicted following burst. For each burst account its optimal block size and minimal first block size. Use that statistics from the last 8 bursts to predict first block size of the next burst. Remove predefined set of block sizes. Allocate any size we see fit, multiple of 4KB, as required by ZIL now. With compression enabled by default, ZFS already writes pretty random block sizes, so this should not surprise space allocator any more. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15635 --- include/sys/zil_impl.h | 8 +- module/zfs/zil.c | 267 ++++++++++++++++++++++++++++++----------- 2 files changed, 201 insertions(+), 74 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index c9db6d428ea2..9a34bafc1c77 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -182,7 +182,6 @@ typedef struct zil_vdev_node { } zil_vdev_node_t; #define ZIL_BURSTS 8 -#define ZIL_PREV_BLKS 16 /* * Stable storage intent log management structure. One per dataset. 
@@ -217,7 +216,9 @@ struct zilog { uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_cur_size; /* current burst full size */ + uint64_t zl_cur_left; /* current burst remaining size */ + uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ @@ -225,7 +226,8 @@ struct zilog { zil_header_t zl_old_header; /* debugging aid */ uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ + uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 7ad0fb344b7b..9b5d866a8c22 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -144,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -1710,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) mutex_exit(&zilog->zl_lock); } -/* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -static const struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see @@ -1735,6 +1718,91 @@ static const struct { */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; +/* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } + + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. 
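+	 *
+	 * Editor's illustration (not part of the original patch; the numbers
+	 * assume a maximum block size of roughly 128 KiB and a 16 KiB waste
+	 * allowance): a 400 KiB burst gives n = 4 and chunk = 100 KiB.
+	 * Since 100 KiB fits under md - waste (about 112 KiB), we plan
+	 * 100 KiB blocks and predict that the first block must take at
+	 * least 400 - 3 * 112 = 64 KiB even if the following three blocks
+	 * are filled to their maximum usable size.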
+ */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} + +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less then 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; + + /* If we are in the middle of a burst, take it into account also. */ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; + } + + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If second minimum size gives 50% saving -- use it. It may cost us + * one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); +} + /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. @@ -1742,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - int i; + uint64_t blksz, plan, plan2; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -1757,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) return (NULL); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. 
*/ - uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, - uint64_t, zil_blksz, - uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what can it be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. + */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); } /* @@ -1835,7 +1909,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1996,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. 
@@ -2038,16 +2148,9 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) } reclen = lr->lrc_reclen; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - ASSERT3U(reclen, ==, sizeof (lr_write_t)); - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - ASSERT3U(reclen, >=, sizeof (lr_t)); - dlen = 0; - } + ASSERT3U(reclen, >=, sizeof (lr_t)); ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); - zilog->zl_cur_used += (reclen + dlen); + dlen = zil_itx_data_size(itx); cont: /* @@ -2088,6 +2191,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; } else { citx = itx; clr = lr; @@ -2109,10 +2213,8 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; + if (dlen > 0) goto cont; - } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) @@ -2139,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) if (lr->lrc_txtype == TX_COMMIT) return; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; @@ -2576,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. @@ -2585,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog) if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { + itx = list_head(sync_list); list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } return (wtxg); } @@ -2728,13 +2836,20 @@ static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || - zilog->zl_cur_used == 0) + zilog->zl_cur_size == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; - zilog->zl_cur_used = 0; + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; } /* @@ -2867,6 +2982,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * itx list to somebody else who care. */ zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); break; } } else { @@ -2876,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2960,9 +3079,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * of each individual itx. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); @@ -3120,12 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ + zil_burst_done(zilog); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - zil_burst_done(zilog); - if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this @@ -3720,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3740,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } + return (zilog); } From 41a8e55101e31ded1df3fb53a1b7c62317454e68 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Jan 2024 19:49:39 -0500 Subject: [PATCH 04/23] ZIL: Update Linux tracing after #15635 While picking parts from #14909 I've missed Linux tracing specific ones, that went unnoticed in default configurations, but breaks the build in some. Reviewed-by: Ameer Hamza Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15730 --- include/os/linux/zfs/sys/trace_zil.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index afa1a274e43c..ae1caa3ac473 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -51,7 +51,9 @@ __field(uint64_t, zl_parse_lr_seq) \ __field(uint64_t, zl_parse_blk_count) \ __field(uint64_t, zl_parse_lr_count) \ - __field(uint64_t, zl_cur_used) \ + __field(uint64_t, zl_cur_size) \ + __field(uint64_t, zl_cur_left) \ + __field(uint64_t, zl_cur_max) \ __field(clock_t, zl_replay_time) \ __field(uint64_t, zl_replay_blks) @@ -72,7 +74,9 @@ __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \ __entry->zl_parse_blk_count = zilog->zl_parse_blk_count;\ __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \ - __entry->zl_cur_used = zilog->zl_cur_used; \ + __entry->zl_cur_size = zilog->zl_cur_size; \ + __entry->zl_cur_left = zilog->zl_cur_left; \ + __entry->zl_cur_max = zilog->zl_cur_max; \ __entry->zl_replay_time = zilog->zl_replay_time; \ __entry->zl_replay_blks = zilog->zl_replay_blks; @@ -82,7 +86,8 @@ "replay %u stop_sync %u logbias %u sync %u " \ "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \ "parse_blk_count %llu parse_lr_count %llu " \ - "cur_used %llu replay_time %lu replay_blks %llu }" + "cur_size %llu cur_left %llu cur_max %llu replay_time %lu " \ + "replay_blks %llu }" #define ZILOG_TP_PRINTK_ARGS \ __entry->zl_lr_seq, __entry->zl_commit_lr_seq, \ @@ -92,7 +97,8 @@ __entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \ __entry->zl_parse_error, __entry->zl_parse_blk_seq, \ __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \ - __entry->zl_parse_lr_count, __entry->zl_cur_used, \ + __entry->zl_parse_lr_count, __entry->zl_cur_size, \ + __entry->zl_cur_left, __entry->zl_cur_max, \ __entry->zl_replay_time, __entry->zl_replay_blks #define ITX_TP_STRUCT_ENTRY \ From 84d6363dbaa671c49aa3613c715ca995b6ec461c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 7 Aug 2023 16:54:41 -0400 Subject: [PATCH 05/23] Refactor dmu_prefetch(). - Split dmu_prefetch_dnode() from dmu_prefetch() into a separate function. It is quite inconvenient to read the code where len = 0 means dnode prefetch instead indirect/data prefetch. One function doing both has no benefits, since the code paths are independent. - Improve dmu_prefetch() handling of long block ranges. Instead of limiting L0 data length to prefetch for to dmu_prefetch_max, make dmu_prefetch_max limit the actual amount of prefetch at the specified level, and, if there is more, prefetch all the rest at higher indirection level. It should improve random access times within the prefetched range of any length, reducing importance of specific dmu_prefetch_max value. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15076 --- include/sys/dmu.h | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 7 +- module/zfs/dmu.c | 103 ++++++++++++++++----------- module/zfs/dsl_deadlist.c | 8 +-- module/zfs/spa_log_spacemap.c | 4 +- module/zfs/zvol.c | 2 +- 7 files changed, 72 insertions(+), 57 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 06b4dc27dfea..5bdb7c0293b8 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -889,6 +889,7 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 05f28033be6a..1ba25bce6196 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 7c473bc7e965..be528f6e8176 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3215ab1c2a14..d82211e6d4c7 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -695,74 +695,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. 
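+ *
+ * Editor's illustration (not part of the original patch; it assumes
+ * 128 KiB data and indirect blocks and, purely for the example, a
+ * dmu_prefetch_max of 8 MiB): prefetching 1 GiB from offset 0 at
+ * level 0 issues prefetches for the first 64 level-0 blocks (8 MiB)
+ * and then continues at level 1, prefetching the seven following
+ * 128 KiB indirect blocks, each of which addresses 1024 data blocks,
+ * instead of thousands of level-0 blocks.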
*/ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); + int64_t level2 = level; + uint64_t start, end, start2, end2; - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } +/* + * Issue prefetch I/Os for the given object's dnode. 
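+ *
+ * Editor's note (not part of the original patch): this replaces the
+ * old idiom of calling dmu_prefetch() with len = 0, as can be seen in
+ * the converted callers elsewhere in this patch (zfs_readdir() on both
+ * platforms, dsl_deadlist, spa_log_spacemap and zvol).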
+ */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); +} + /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index ac30a370813f..e6c8d4be13b4 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e4b..cf05158b63f8 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 20ea71f23376..89b523ddd903 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -931,7 +931,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } From 01d48764a7e8eefdea606505699535737b9c042f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 13 Feb 2024 14:15:16 -0500 Subject: [PATCH 06/23] Linux: Cleanup taskq threads spawn/exit This changes taskq_thread_should_stop() to limit maximum exit rate for idle threads to one per 5 seconds. I believe the previous one was broken, not allowing any thread exits for tasks arriving more than one at a time and so completing while others are running. Also while there: - Remove taskq_thread_spawn() calls on task allocation errors. - Remove extra taskq_thread_should_stop() call. Reviewed-by: Brian Behlendorf Reviewed-by: Rich Ercolani Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15873 --- include/os/linux/spl/sys/taskq.h | 2 +- man/man4/spl.4 | 18 ++----- module/os/linux/spl/spl-taskq.c | 85 +++++++++++--------------------- 3 files changed, 34 insertions(+), 71 deletions(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 6c1b4377a98a..6c0dbbefda7f 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,7 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; - unsigned long lastshouldstop; /* when to purge dynamic */ + unsigned long lastspawnstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 414a92394858..5cc12764e18c 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -186,18 +186,8 @@ reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. . -.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint -(Linux-only) -How long a taskq has to have had no work before we tear it down. -Previously, we would tear down a dynamic taskq worker as soon -as we noticed it had no work, but it was observed that this led -to a lot of churn in tearing down things we then immediately -spawned anew. -In practice, it seems any nonzero value will remove the vast -majority of this churn, while the nontrivially larger value -was chosen to help filter out the little remaining churn on -a mostly idle system. -Setting this value to -.Sy 0 -will revert to the previous behavior. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint +Minimum idle threads exit interval for dynamic taskqs. +Smaller values allow idle threads exit more often and potentially be +respawned again on demand, causing more churn. .El diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index d18f935b167c..0e44aa1fcd38 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,12 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); -static uint_t spl_taskq_thread_timeout_ms = 10000; +static uint_t spl_taskq_thread_timeout_ms = 5000; /* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); /* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, - "Time to require a dynamic thread be idle before it gets cleaned up"); + "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -594,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) + if (taskq_thread_spawn(tq) == 0) goto out; } @@ -629,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. 
*/ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -676,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -704,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out2; + if (taskq_thread_spawn(tq) == 0) + goto out; flags |= TQ_FRONT; } @@ -742,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); @@ -825,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq) if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); + tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); @@ -836,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq) } /* - * Threads in a dynamic taskq should only exit once it has been completely - * drained and no other threads are actively servicing tasks. This prevents - * threads from being created and destroyed more than is required. + * Threads in a dynamic taskq may exit once there is no more work to do. + * To prevent threads from being created and destroyed too often limit + * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid @@ -847,44 +847,22 @@ taskq_thread_spawn(taskq_t *tq) static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { - if (!(tq->tq_flags & TASKQ_DYNAMIC)) + ASSERT(!taskq_next_ent(tq)); + if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); - + if (!(tq->tq_flags & TASKQ_ACTIVE)) + return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); - - int no_work = - ((tq->tq_nspawn == 0) && /* No threads are being spawned */ - (tq->tq_nactive == 0) && /* No threads are handling tasks */ - (tq->tq_nthreads > 1) && /* More than 1 thread is running */ - (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ - - /* - * If we would have said stop before, let's instead wait a bit, maybe - * we'll see more work come our way soon... - */ - if (no_work) { - /* if it's 0, we want the old behavior. */ - /* if the taskq is being torn down, we also want to go away. 
*/ - if (spl_taskq_thread_timeout_ms == 0 || - !(tq->tq_flags & TASKQ_ACTIVE)) - return (1); - unsigned long lasttime = tq->lastshouldstop; - if (lasttime > 0) { - if (time_after(jiffies, lasttime + - msecs_to_jiffies(spl_taskq_thread_timeout_ms))) - return (1); - else - return (0); - } else { - tq->lastshouldstop = jiffies; - } - } else { - tq->lastshouldstop = 0; - } - return (0); + ASSERT3U(tq->tq_nthreads, >, 1); + if (tq->tq_nspawn != 0) + return (0); + if (time_before(jiffies, tq->lastspawnstop + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (0); + tq->lastspawnstop = jiffies; + return (1); } static int @@ -935,10 +913,8 @@ taskq_thread(void *args) if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { - if (taskq_thread_should_stop(tq, tqt)) { - wake_up_all(&tq->tq_wait_waitq); + if (taskq_thread_should_stop(tq, tqt)) break; - } add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -1013,9 +989,6 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } else { - if (taskq_thread_should_stop(tq, tqt)) - break; } set_current_state(TASK_INTERRUPTIBLE); @@ -1122,7 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; - tq->lastshouldstop = 0; + tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); From 8e86cd5e85e5fc38b6e32946acd8610751560344 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 20 Mar 2024 20:22:36 -0400 Subject: [PATCH 07/23] Update resume token at object receive. Before this change resume token was updated only on data receive. Usually it is enough to resume replication without much overlap. But we've got a report of a curios case, where replication source was traversed with recursive grep, which through enabled atime modified every object without modifying any data. It produced several gigabytes of replication traffic without a single data write and so without a single resume point. While the resume token was not designed to resume from an object, I've found that the send implementation always sends object before any data. So by requesting resume from offset 0 we are effectively resuming from the object, followed (or not) by the data at offset 0, just as we need it. Reviewed-by: Allan Jude Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15927 --- module/zfs/dmu_recv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 54aa60259ea1..2cf10909738b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } + + /* + * If the receive fails, we want the resume stream to start with the + * same record that we last successfully received. There is no way to + * request resume from the object record, but we can benefit from the + * fact that sender always sends object record before anything else, + * after which it will "resend" data at offset 0 and resume normally. 
+ */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); From 98d846c512f6eb931f883a19ec6a62a082f13468 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 18:42:21 -0400 Subject: [PATCH 08/23] BRT: Change brt_pending_tree sorting order It does not look important how exactly brt_pending_tree is sorted. When cloning large file, it is quite likely that all of its blocks have identical physical birth times, so comparing them first does not provide useful entropy, while accesses additional cache line. In most cases combination of vdev and offset provides unique result and physical birth time comparison is not even needed. Meanwhile, when traversing the tree inside brt_pending_apply(), it can be beneficial for dbuf cache and CPU cache hits to group processing by vdev and so by the per-VDEV BRT ZAPs. Reviewed-by: Rob Norris Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15954 --- module/zfs/brt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 225ddaca1e54..3d565cd1397c 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1420,13 +1420,14 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), + BP_PHYSICAL_BIRTH(bp2)); } } From 85a00acbd6002ba1cad3a125e70abcc51da44fdf Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 19:43:53 -0400 Subject: [PATCH 09/23] ZAP: Some cleanups/micro-optimizations - Remove custom zap_memset(), use regular memset(). - Use PANIC() instead of opaque cmn_err(CE_PANIC). - Provide entry parameter to zap_leaf_rehash_entry(). - Reduce branching in zap_leaf_array_create() inner loop. - Remove signedness where it should not be. Should be no function changes. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15976 --- include/sys/zap_leaf.h | 8 ++--- module/zfs/zap_leaf.c | 77 +++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index ebc67c2bf465..d563edd7ba59 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). 
*/ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1< #include -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -96,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -160,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < 
ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - 
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = From 656f9d7f3caa48d31b638a77ba794e903ab498e9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:04 -0400 Subject: [PATCH 10/23] BRT: Skip duplicate BRT prefetches If there is a pending entry for this block, then we've already issued BRT prefetch for it within this TXG, so don't do it again. BRT vdev lookup and following zap_prefetch_uint64() call can be pretty expensive and should be avoided when not necessary. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15941 --- module/zfs/brt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 3d565cd1397c..7ddec0b4b9bb 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1472,10 +1472,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); - } - /* Prefetch BRT entry, as we will need it in the syncing context. */ - brt_prefetch(brt, bp); + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } } void From 0e0786b9991fafc93e9f2bc9073d326d8bfb5850 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:50 -0400 Subject: [PATCH 11/23] ZAP: Massively switch to _by_dnode() interfaces Before this change ZAP called dnode_hold() for almost every block access, that was clearly visible in profiler under heavy load, such as BRT. This patch makes it always hold the dnode reference between zap_lockdir() and zap_unlockdir(). It allows to avoid most of dnode operations between those. It also adds several new _by_dnode() APIs to ZAP and uses them in BRT code. Also adds dmu_prefetch_by_dnode() variant and uses it in the ZAP code. After this there remains only one call to dmu_buf_dnode_enter(), which seems to be unneeded. So remove the call and the functions. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15951 --- include/sys/dmu.h | 4 +- include/sys/zap.h | 8 ++ include/sys/zap_impl.h | 1 + module/zfs/brt.c | 72 +++----------- module/zfs/dbuf.c | 15 --- module/zfs/dmu.c | 18 +++- module/zfs/dmu_recv.c | 7 +- module/zfs/zap.c | 43 ++++----- module/zfs/zap_micro.c | 206 +++++++++++++++++++++++++++++------------ 9 files changed, 202 insertions(+), 172 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5bdb7c0293b8..26b329b53f05 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -739,8 +739,6 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -889,6 +887,8 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5faceb..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7ddec0b4b9bb..5e10df9dfe56 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); - - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. 
It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). - */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 280001bc34b6..ae5657d762f5 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4127,21 +4127,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d82211e6d4c7..8986f55e792a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - int64_t level2 = level; - uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); @@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. 
@@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); } /* @@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2cf10909738b..9f1c25f866f7 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..da86defb445c 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, 
zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t 
*zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..d806988af96d 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ 
void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if 
fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); From 28768edeef27402e26614995c559300ae1da5869 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:59:55 -0400 Subject: [PATCH 12/23] BRT: Relax brt_pending_apply() locking Since brt_pending_apply() is running in syncing context, no other brt_pending_tree accesses are possible for the TXG. We don't need to acquire brt_pending_lock here. 
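In rough pseudocode, the locking argument reduces to the sketch below. This is
only an illustration of the reasoning, not the change itself; brt_pending_tree,
brt_pending_entry_t and brt_pending_entry_cache are the names used in the diff
that follows, while the helper name and the elided "apply" step are
hypothetical:

    /* Sketch: drain one TXG bucket from syncing context, lock-free. */
    static void
    brt_pending_apply_sketch(brt_t *brt, uint64_t txg)
    {
            avl_tree_t *tree = &brt->brt_pending_tree[txg & TXG_MASK];
            brt_pending_entry_t *bpe;
            void *cookie = NULL;

            /*
             * Open-context writers only ever touch the bucket of the
             * currently open TXG, so nobody else can reach this already
             * quiesced bucket while spa_sync() drains it.
             */
            while ((bpe = avl_destroy_nodes(tree, &cookie)) != NULL) {
                    /* ... apply the entry (BRT/DDT updates) ... */
                    kmem_cache_free(brt_pending_entry_cache, bpe);
            }
    }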
Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15955 --- module/zfs/brt.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 5e10df9dfe56..416caeb11c7e 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1473,26 +1473,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt; + brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; - kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); - brt = spa->spa_brt; + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - - mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; - mutex_exit(pending_lock); - for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it @@ -1510,10 +1507,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } kmem_cache_free(brt_pending_entry_cache, bpe); - mutex_enter(pending_lock); } - - mutex_exit(pending_lock); } static void From 38ce13b957e7631dfb701eeeec4a7243aca9f344 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 18:02:38 -0400 Subject: [PATCH 13/23] BRT: Make BRT block sizes configurable Similar to DDT make BRT data and indirect block sizes configurable via module parameters. I am not sure what would be the best yet, but similar to DDT 4KB blocks kill all chances of compression on vdev with ashift=12 or more, that on my tests reaches 3x. While here, fix documentation for respective DDT parameters. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15967 --- man/man4/zfs.4 | 17 +++++++++++++++-- module/zfs/brt.c | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5307f1f32e93..24ea390d6e9a 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -245,12 +245,25 @@ For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Controls prefetching BRT records for blocks which are going to be cloned. +. +.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP data block size as a power of 2. Note that changing this after +creating a BRT on the pool will not affect existing BRTs, only newly created +ones. +. +.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP indirect block size as a power of 2. Note that changing this +after creating a BRT on the pool will not affect existing BRTs, only newly +created ones. +. +.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. 
Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 416caeb11c7e..014f26517ddd 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ -int zfs_brt_prefetch = 1; +static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ @@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1; #define BRT_DEBUG(...) do { } while (0) #endif -int brt_zap_leaf_blockshift = 12; -int brt_zap_indirect_blockshift = 12; +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; @@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, - brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, - 0, tx); + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -1363,7 +1362,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp) ASSERT(bp != NULL); - if (!zfs_brt_prefetch) + if (!brt_zap_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); @@ -1680,9 +1679,10 @@ brt_unload(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, - "Enable prefetching of BRT entries"); -#ifdef ZFS_BRT_DEBUG -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); -#endif +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); /* END CSTYLED */ From 8477add535bc42d9e7bc1b56ff70b0b2e12dc3ec Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 20:13:45 -0400 Subject: [PATCH 14/23] BRT: Skip getting length in brt_entry_lookup() Unlike DDT, where ZAP values may have different lengths due to compression, all BRT entries are identical 8-byte counters. It does not make sense to first fetch the length only to assert it. zap_lookup_uint64() is specifically designed to work with counters of different size and should return error if something odd found. Calling it straight allows to save some measurable CPU time. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15950 --- module/zfs/brt.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 014f26517ddd..bf8fdf6ea4b4 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -900,7 +900,6 @@ static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; - uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); @@ -918,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) brt_unlock(brt); - error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, &one, &physsize); - if (error == 0) { - ASSERT3U(one, ==, 1); - ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, - &bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount); - BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " - "count=%llu error=%d", (u_longlong_t)mos_entries, - (u_longlong_t)brtvd->bv_vdevid, - (u_longlong_t)bre->bre_offset, - error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); - } + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); brt_wlock(brt); From 75f414063748a44d7e5f56c2096a2913c39556b4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 18 Mar 2024 14:19:53 -0400 Subject: [PATCH 15/23] BRT: Fix holes cloning. - When reading L0 block pointers handle buffers without ones and without dirty records as a holes. Those appear when dnode size was increased, but the end was never written, so there are no new indirection levels to store the pointers. It makes no sense to return EAGAIN here, since sync won't create new indirection levels until there will be actual writes. - When cloning blocks set destination hole logical birth time to the current TXG. Otherwise if we are cloning over existing data, newly created holes may not be properly replicated later. Use BP_SET_BIRTH() when possible to not replicate its logic. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15994 Closes #16007 --- module/zfs/dmu.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8986f55e792a..7d07accc7c9e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2265,11 +2265,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (bp == NULL) { /* - * The block was created in this transaction group, - * so it has no BP yet. + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. */ - error = SET_ERROR(EAGAIN); - goto out; + BP_ZERO(&bps[i]); + continue; } /* * Make sure we clone only data blocks. 
@@ -2361,18 +2363,16 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; - dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; - if (BP_IS_HOLE(bp)) { - dl->dr_overridden_by.blk_birth = 0; - dl->dr_overridden_by.blk_phys_birth = 0; - } else { - dl->dr_overridden_by.blk_birth = dr->dr_txg; + if (!BP_IS_HOLE(bp) || bp->blk_birth != 0) { if (!BP_IS_EMBEDDED(bp)) { - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_PHYSICAL_BIRTH(bp)); + } else { + dl->dr_overridden_by.blk_birth = dr->dr_txg; } } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); From 3c98b5568df8d3edd92b7a0f549124a634c4a3bb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 12:25:14 -0400 Subject: [PATCH 16/23] BRT: Fix tests to work on non-empty pools It should not normally happen, but if it does, better to not fail everything for no good reason, or it may be hard to debug. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16007 --- .../functional/bclone/bclone_common.kshlib | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index 3b8eaea5bb54..84b92b4dcdc9 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -97,20 +97,19 @@ function verify_pool_prop_eq function verify_pool_props { - typeset -r dsize=$1 - typeset -r ratio=$2 + typeset -r oused=$1 + typeset -r osaved=$2 + typeset dsize=$3 + typeset ratio=$4 if [[ $dsize -eq 0 ]]; then - verify_pool_prop_eq bcloneused 0 - verify_pool_prop_eq bclonesaved 0 - verify_pool_prop_eq bcloneratio 1.00 - else - if [[ $ratio -eq 1 ]]; then - verify_pool_prop_eq bcloneused 0 - else - verify_pool_prop_eq bcloneused $dsize - fi - verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + ratio=1 + elif [[ $ratio -eq 1 ]]; then + dsize=0 + fi + verify_pool_prop_eq bcloneused $(($oused+$dsize)) + verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1))) + if [[ $oused -eq 0 ]]; then verify_pool_prop_eq bcloneratio "${ratio}.00" fi } @@ -124,16 +123,22 @@ function bclone_test typeset -r srcdir=$4 typeset -r dstdir=$5 typeset dsize + typeset oused + typeset osaved typeset -r original="${srcdir}/original" typeset -r clone="${dstdir}/clone" log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) + # Create a test file with known content. case $datatype in random|text) - sync_pool $TESTPOOL if [[ $datatype = "random" ]]; then dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null else @@ -146,13 +151,13 @@ function bclone_test sync_pool $TESTPOOL # It is hard to predict block sizes that will be used, # so just do one clone and take it from bcloneused. 
- filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + dsize=$(get_pool_prop bcloneused $TESTPOOL) + dsize=$(($dsize-$oused)) if [[ $embedded = "false" ]]; then - log_must test $filesize -gt 0 + log_must test $dsize -gt 0 fi rm -f "${clone}-tmp" sync_pool $TESTPOOL - dsize=$filesize ;; hole) log_must truncate_test -s $filesize -f $original @@ -217,7 +222,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}4" $filesize test_file_integrity $original_checksum "${clone}5" $filesize - verify_pool_props $dsize 7 + verify_pool_props $oused $osaved $dsize 7 # Clear cache and test after fresh import. log_must zpool export $TESTPOOL @@ -240,7 +245,7 @@ function bclone_test sync_pool $TESTPOOL - verify_pool_props $dsize 11 + verify_pool_props $oused $osaved $dsize 11 log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL @@ -268,7 +273,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}8" $filesize test_file_integrity $original_checksum "${clone}9" $filesize - verify_pool_props $dsize 6 + verify_pool_props $oused $osaved $dsize 6 rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" @@ -276,11 +281,11 @@ function bclone_test test_file_integrity $original_checksum "${clone}6" $filesize - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 rm -f "${clone}6" sync_pool $TESTPOOL - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 } From 6552ab756db5f52bc1bc2bda4694fc48bf429192 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 13:08:05 -0400 Subject: [PATCH 17/23] BRT: Check pool clone stats in more tests This should allow to catch some leaks, if those happen. While there fix some cosmetic issues. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16007 --- .../bclone/bclone_corner_cases.kshlib | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib index ddfbfc999c4e..aeb8efe91715 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -66,7 +66,7 @@ function bclone_corner_cases_init export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) - export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest) export FIRST_HALF_CHECKSUM="" export SECOND_HALF_CHECKSUM="" } @@ -210,6 +210,8 @@ function bclone_corner_cases_test typeset -r dstdir=$2 typeset limit=$3 typeset -i count=0 + typeset oused + typeset osaved if [[ $srcdir != "count" ]]; then if [[ -n "$limit" ]]; then @@ -217,6 +219,11 @@ function bclone_corner_cases_test limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) fi bclone_corner_cases_init $srcdir $dstdir + + # Save current block cloning stats for later use. 
+ sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) fi # @@ -285,21 +292,24 @@ function bclone_corner_cases_test overwrite_clone "$second_overwrite" if checksum_compare $read_after; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" fi log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL if checksum_compare "yes"; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" fi rm -f "$CLONE" + sync_pool $TESTPOOL + verify_pool_prop_eq bcloneused $oused + verify_pool_prop_eq bclonesaved $osaved done done done From 6e73d87cce4fabd554b264dd5c6abd47c0020fd9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 3 Apr 2024 18:04:26 -0400 Subject: [PATCH 18/23] Improve dbuf_read() error reporting Previous code reported non-ZIO errors only via return value, but not via parent ZIO. It could cause NULL-dereference panics due to dmu_buf_hold_array_by_dnode() ignoring the return value, relying solely on parent ZIO status. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Reported by: Ameer Hamza Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16042 --- module/zfs/dbuf.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ae5657d762f5..46c564c2572b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1542,17 +1542,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning. 
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; blkptr_t bp, *bpp; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -1627,8 +1624,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if (err != 0) goto early_unlock; - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); @@ -1653,12 +1648,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * parent's rwlock, which would be a lock ordering violation. */ dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1743,7 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { int err = 0; boolean_t prefetch; @@ -1759,7 +1753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) @@ -1806,13 +1800,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - if (zio == NULL && (db->db_state == DB_NOFILL || + if (pio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us @@ -1833,9 +1827,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ if (need_wait) { if (err == 0) - err = zio_wait(zio); + err = zio_wait(pio); else - VERIFY0(zio_wait(zio)); + (void) zio_wait(pio); + pio = NULL; } } else { /* @@ -1862,7 +1857,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); + db, zio_t *, pio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) @@ -1871,6 +1866,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) } } + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); + } + return (err); } From 3a73611ef39165ed2beb6da26a81738bb549e0a8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 15:03:18 -0400 Subject: [PATCH 19/23] Fix read errors race after block cloning Investigating read errors triggering panic fixed in #16042 I've found that we have a race in a sync process between the moment 
dirty record for cloned block is removed and the moment dbuf is destroyed. If dmu_buf_hold_array_by_dnode() take a hold on a cloned dbuf before it is synced/destroyed, then dbuf_read_impl() may see it still in DB_NOFILL state, but without the dirty record. Such case is not an error, but equivalent to DB_UNCACHED, since the dbuf block pointer is already updated by dbuf_write_ready(). Unfortunately it is impossible to safely change the dbuf state to DB_UNCACHED there, since there may already be another cloning in progress, that dropped dbuf lock before creating a new dirty record, protected only by the range lock. Reviewed-by: Rob Norris Reviewed-by: Robert Evans Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16052 --- module/zfs/dbuf.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 46c564c2572b..42e5811c8597 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1548,7 +1548,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1562,29 +1562,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; } From f7b6c8623e7473ebfaee1488e366056d37145e91 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 18:13:27 -0400 Subject: [PATCH 20/23] Speculative prefetch for reordered requests Before this change speculative prefetcher was able to detect a stream only if all of its accesses are perfectly sequential. It was easy to implement and is perfectly fine for single-threaded applications. Unfortunately multi-threaded network servers, such as iSCSI, SMB or NFS usually have plenty of threads and may often reorder requests, preventing successful speculation and prefetch. This change allows speculative prefetcher to detect streams even if requests are reordered by introducing a list of 9 non-contiguous ranges up to 16MB ahead of current stream position and filling the gaps as more requests arrive. 
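As a rough sketch of that bookkeeping (hand-written for illustration; only the
zsrange_t type and the ZFETCH_RANGES constant below come from this patch, while
the helper itself and its merge policy are simplified assumptions), an access
ahead of the stream is merged into a small fixed table of [start, end) block
ranges instead of being treated as a miss:

    /* Sketch: remember blocks [start, end) seen ahead of the stream. */
    static boolean_t
    zsrange_add_sketch(zsrange_t r[ZFETCH_RANGES], uint16_t start, uint16_t end)
    {
            for (int i = 0; i < ZFETCH_RANGES; i++) {
                    if (r[i].start == r[i].end) {
                            /* Unused slot (empty range): claim it. */
                            r[i].start = start;
                            r[i].end = end;
                            return (B_TRUE);
                    }
                    if (start <= r[i].end && end >= r[i].start) {
                            /* Overlapping or adjacent: widen the range. */
                            r[i].start = MIN(r[i].start, start);
                            r[i].end = MAX(r[i].end, end);
                            return (B_TRUE);
                    }
            }
            return (B_FALSE);       /* Table full: caller treats it as a miss. */
    }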
It also allows stream to proceed even with holes up to a certain configurable threshold (25%). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16022 --- cmd/arc_summary | 11 +- include/sys/dmu_zfetch.h | 16 ++- man/man4/zfs.4 | 11 ++ module/zfs/dmu.c | 8 +- module/zfs/dmu_zfetch.c | 289 +++++++++++++++++++++++++++++++-------- 5 files changed, 272 insertions(+), 63 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 9c69ec4f8ccc..100fb1987a8b 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -793,18 +793,27 @@ def section_dmu(kstats_dict): zfetch_stats = isolate_section('zfetchstats', kstats_dict) - zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + zfetch_access_total = int(zfetch_stats['hits']) +\ + int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\ + int(zfetch_stats['past']) + int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) + future = int(zfetch_stats['future']) + int(zfetch_stats['stride']) + prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total), + f_hits(future)) + prt_i2('Hits behind stream:', + f_perc(zfetch_stats['past'], zfetch_access_total), + f_hits(zfetch_stats['past'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) + prt_i1('Stream strides:', f_hits(zfetch_stats['stride'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index f00e13cf03a6..322472fb1ae2 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -45,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 24ea390d6e9a..1191cc962492 100644 --- 
a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -544,6 +544,10 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . +.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint +Log2 fraction of holes in speculative prefetch stream allowed for it to +proceed. +. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to @@ -558,6 +562,13 @@ Max bytes to prefetch per stream. .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . +.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint +Requests within this byte distance from the current prefetch stream position +are considered parts of the stream, reordered due to parallel processing. +Such requests do not advance the stream position immediately unless +.Sy zfetch_hole_shift +fill threshold is reached, but saved to fill holes in the stream later. +. .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 7d07accc7c9e..d8d5cfdbd230 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { - if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + if (zs) { + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, + B_TRUE); + } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zfs_racct_write(length, nblks); if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index d0acaf502066..e26195176036 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; +/* max request reorder distance within a stream (default 16MB) */ +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +/* Max log2 fraction of holes in a stream */ +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_future; + kstat_named_t zfetchstat_stride; + kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; @@ -76,6 +83,9 @@ typedef struct zfetch_stats { static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "future", KSTAT_DATA_UINT64 }, + { "stride", KSTAT_DATA_UINT64 }, + { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, @@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = { struct { wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_future; + wmsum_t zfetchstat_stride; + wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; @@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw) return (EACCES); zs->zfetchstat_hits.value.ui64 = 
wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_future.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_future); + zs->zfetchstat_stride.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_stride); + zs->zfetchstat_past.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = @@ -122,6 +141,9 @@ void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_future, 0); + wmsum_init(&zfetch_sums.zfetchstat_stride, 0); + wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); @@ -147,6 +169,9 @@ zfetch_fini(void) } wmsum_fini(&zfetch_sums.zfetchstat_hits); + wmsum_fini(&zfetch_sums.zfetchstat_future); + wmsum_fini(&zfetch_sums.zfetchstat_stride); + wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); @@ -222,22 +247,22 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs, *zs_next, *zs_old = NULL; - hrtime_t now = gethrtime(), t; + uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Delete too old streams, reusing the first found one. */ - t = now - SEC2NSEC(zfetch_max_sec_reap); + t = now - zfetch_max_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ - if (zfs_refcount_count(&zs->zs_refs) != 1) + if ((int)(zs->zs_atime - t) >= 0) continue; - if (zs->zs_atime > t) + if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (zs_old) dmu_zfetch_stream_remove(zf, zs); @@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) } if (zs_old) { zs = zs_old; + list_remove(&zf->zf_stream, zs); goto reuse; } @@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) * for all the streams to be non-overlapping. */ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { - t = now - SEC2NSEC(zfetch_min_sec_reap); + t = now - zfetch_min_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { - if (zfs_refcount_count(&zs->zs_refs) != 1) + if ((int)(zs->zs_atime - t) >= 0) continue; - if (zs->zs_atime > t) + if (zfs_refcount_count(&zs->zs_refs) != 1) continue; - if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime) + if (zs_old == NULL || + (int)(zs_old->zs_atime - zs->zs_atime) >= 0) zs_old = zs; } if (zs_old) { zs = zs_old; + list_remove(&zf->zf_stream, zs); goto reuse; } ZFETCHSTAT_BUMP(zfetchstat_max_streams); @@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) } zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_fetch = zf; zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; - list_insert_head(&zf->zf_stream, zs); reuse: + list_insert_head(&zf->zf_stream, zs); zs->zs_blkid = blkid; + /* Allow immediate stream reuse until first hit. 
*/ + zs->zs_atime = now - zfetch_min_sec_reap; + memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); zs->zs_pf_dist = 0; + zs->zs_ipf_dist = 0; zs->zs_pf_start = blkid; zs->zs_pf_end = blkid; - zs->zs_ipf_dist = 0; zs->zs_ipf_start = blkid; zs->zs_ipf_end = blkid; - /* Allow immediate stream reuse until first hit. */ - zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap); zs->zs_missed = B_FALSE; zs->zs_more = B_FALSE; } @@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); } +/* + * Process stream hit access for nblks blocks starting at zs_blkid. Return + * number of blocks to proceed for after aggregation with future ranges. + */ +static uint64_t +dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) +{ + uint_t i, j; + + /* Optimize sequential accesses (no future ranges). */ + if (zs->zs_ranges[0].start == 0) + goto done; + + /* Look for intersections with further ranges. */ + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > nblks) + break; + if (r->end >= nblks) { + nblks = r->end; + i++; + break; + } + } + + /* Delete all found intersecting ranges, updates remaining. */ + for (j = 0; i < ZFETCH_RANGES; i++, j++) { + if (zs->zs_ranges[i].start == 0) + break; + ASSERT3U(zs->zs_ranges[i].start, >, nblks); + ASSERT3U(zs->zs_ranges[i].end, >, nblks); + zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; + zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; + } + if (j < ZFETCH_RANGES) { + zs->zs_ranges[j].start = 0; + zs->zs_ranges[j].end = 0; + } + +done: + zs->zs_blkid += nblks; + return (nblks); +} + +/* + * Process future stream access for nblks blocks starting at blkid. Return + * number of blocks to proceed for if future ranges reach fill threshold. + */ +static uint64_t +dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) +{ + ASSERT3U(blkid, >, zs->zs_blkid); + blkid -= zs->zs_blkid; + ASSERT3U(blkid + nblks, <=, UINT16_MAX); + + /* Search for first and last intersection or insert point. */ + uint_t f = ZFETCH_RANGES, l = 0, i; + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > blkid + nblks) + break; + if (r->end < blkid) + continue; + if (f > i) + f = i; + if (l < i) + l = i; + } + if (f <= l) { + /* Got some intersecting range, expand it if needed. */ + if (zs->zs_ranges[f].start > blkid) + zs->zs_ranges[f].start = blkid; + zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); + if (f < l) { + /* Got more than one intersection, remove others. */ + for (f++, l++; l < ZFETCH_RANGES; f++, l++) { + zs->zs_ranges[f].start = zs->zs_ranges[l].start; + zs->zs_ranges[f].end = zs->zs_ranges[l].end; + } + zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; + zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + } + } else if (i < ZFETCH_RANGES) { + /* Got no intersecting ranges, insert new one. */ + for (l = ZFETCH_RANGES - 1; l > i; l--) { + zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; + zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; + } + zs->zs_ranges[i].start = blkid; + zs->zs_ranges[i].end = blkid + nblks; + } else { + /* No space left to insert. Drop the range. */ + return (0); + } + + /* Check if with the new access addition we reached fill threshold. 
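Gaps between the remembered ranges are summed starting from the current stream position; the stream may then advance to the end of the furthest range whose accumulated gaps stay within 1/2^zfetch_hole_shift of that range's distance from the position.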
*/ + if (zfetch_hole_shift >= 16) + return (0); + uint_t hole = 0; + for (i = f = l = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0) + break; + hole += r->start - f; + f = r->end; + if (hole <= r->end >> zfetch_hole_shift) + l = r->end; + } + if (l > 0) + return (dmu_zfetch_hit(zs, l)); + + return (0); +} + /* * This is the predictive prefetch entry point. dmu_zfetch_prepare() * associates dnode access specified with blkid and nblks arguments with @@ -365,53 +507,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, mutex_enter(&zf->zf_lock); /* - * Find matching prefetch stream. Depending on whether the accesses + * Find perfect prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + uint64_t end_blkid = blkid + nblks; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { - break; + goto hit; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; - break; + goto hit; } } /* - * If the file is ending, remove the matching stream if found. - * If not found then it is too late to create a new one now. + * Find close enough prefetch stream. Access crossing stream position + * is a hit in its new part. Access ahead of stream position considered + * a hit for metadata prefetch, since we do not care about fill percent, + * or stored for future otherwise. Access behind stream position is + * silently ignored, since we already skipped it reaching fill percent. */ - uint64_t end_of_access_blkid = blkid + nblks; - if (end_of_access_blkid >= maxblkid) { - if (zs != NULL) - dmu_zfetch_stream_remove(zf, zs); - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); + uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); + uint_t t = gethrestime_sec() - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid > zs->zs_blkid) { + if (end_blkid <= zs->zs_blkid + max_reorder) { + if (!fetch_data) { + nblks = dmu_zfetch_hit(zs, + end_blkid - zs->zs_blkid); + ZFETCHSTAT_BUMP(zfetchstat_stride); + goto future; + } + nblks = dmu_zfetch_future(zs, blkid, nblks); + if (nblks > 0) + ZFETCHSTAT_BUMP(zfetchstat_stride); + else + ZFETCHSTAT_BUMP(zfetchstat_future); + goto future; + } + } else if (end_blkid >= zs->zs_blkid) { + nblks -= zs->zs_blkid - blkid; + blkid += zs->zs_blkid - blkid; + goto hit; + } else if (end_blkid + max_reorder > zs->zs_blkid && + (int)(zs->zs_atime - t) >= 0) { + ZFETCHSTAT_BUMP(zfetchstat_past); + zs->zs_atime = gethrestime_sec(); + goto out; + } } - /* Exit if we already prefetched this block before. */ - if (nblks == 0) { - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); - } + /* + * This access is not part of any existing stream. Create a new + * stream for it unless we are at the end of file. + */ + if (end_blkid < maxblkid) + dmu_zfetch_stream_create(zf, end_blkid); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. 
- */ - dmu_zfetch_stream_create(zf, end_of_access_blkid); +hit: + nblks = dmu_zfetch_hit(zs, nblks); + ZFETCHSTAT_BUMP(zfetchstat_hits); + +future: + zs->zs_atime = gethrestime_sec(); + + /* Exit if we already prefetched for this position before. */ + if (nblks == 0) + goto out; + + /* If the file is ending, remove the stream. */ + end_blkid = zs->zs_blkid; + if (end_blkid >= maxblkid) { + dmu_zfetch_stream_remove(zf, zs); +out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } @@ -427,7 +608,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * than ~6% of ARC held by active prefetches. It should help with * getting out of RAM on some badly mispredicted read patterns. */ - unsigned int dbs = zf->zf_dnode->dn_datablkshift; unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { @@ -447,10 +627,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, } else { pf_nblks = 0; } - if (zs->zs_pf_start < end_of_access_blkid) - zs->zs_pf_start = end_of_access_blkid; - if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) - zs->zs_pf_end = end_of_access_blkid + pf_nblks; + if (zs->zs_pf_start < end_blkid) + zs->zs_pf_start = end_blkid; + if (zs->zs_pf_end < end_blkid + pf_nblks) + zs->zs_pf_end = end_blkid + pf_nblks; /* * Do the same for indirects, starting where we will stop reading @@ -468,9 +648,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; - zs->zs_blkid = end_of_access_blkid; - /* Protect the stream from reclamation. */ - zs->zs_atime = gethrtime(); zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. */ zfs_refcount_add(&zs->zs_callers, NULL); @@ -478,15 +655,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - - ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void -dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, + boolean_t have_lock) { - zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; @@ -562,7 +737,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) - dmu_zfetch_run(zs, missed, have_lock); + dmu_zfetch_run(zf, zs, missed, have_lock); } ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, @@ -585,3 +760,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, + "Max request reorder distance within a stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, + "Max log2 fraction of holes in a stream"); From d52f7fbe68ed768bfa402ad7127305d5e94751b8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 8 Apr 2024 18:23:43 -0400 Subject: [PATCH 21/23] Remove db_state DB_NOFILL checks from syncing context Syncing context should not depend on current state of dbuf, which could already change several times in later transaction groups, but rely solely on dirty record for the transaction group being synced. 
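(As a purely illustrative contrast, not code quoted from this patch: a sync-path test such as db->db_state == DB_NOFILL becomes a test against the dirty record being synced, for example dr->dt.dl.dr_data == NULL, since that record is fixed for its txg while db_state may have changed since.)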
Some of the checks seem already impossible, while instead of others I think we should better check for absence of data in the specific dirty record rather than DB_NOFILL. Reviewed-by: Robert Evans Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16057 --- module/zfs/dbuf.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 42e5811c8597..5d1887175a53 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4550,11 +4550,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4839,11 +4838,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != NULL && - dr->dt.dl.dr_data != db->db_buf) { - arc_buf_destroy(dr->dt.dl.dr_data, db); - } + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -5042,21 +5039,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -5092,7 +5086,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? 
WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5124,7 +5118,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, From 599743dc9be645538c0034e11f23d39bf5e35137 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:14:04 -0400 Subject: [PATCH 22/23] Small fix to prefetch ranges aggregation When, after #16022, adding a new range aggregates more than two existing ranges (which should be very rare, only when several streams overlap), we may need to zero not the last range, but an earlier one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16072 --- module/zfs/dmu_zfetch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index e26195176036..3439f9bddf4e 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -418,8 +418,8 @@ dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) zs->zs_ranges[f].start = zs->zs_ranges[l].start; zs->zs_ranges[f].end = zs->zs_ranges[l].end; } - zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; - zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; } } else if (i < ZFETCH_RANGES) { /* Got no intersecting ranges, insert new one. */ From ddf864dcc326833ea5e9bf2d841d2cf6f1f3e429 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Apr 2024 19:23:19 -0400 Subject: [PATCH 23/23] L2ARC: Relax locking during write The previous code held the ARC state sublist lock throughout the whole L2ARC write process, which included a number of allocations and even ZIO issues. Being blocked in any of those places, the code could also block ARC eviction, which could cause OOM activation or even deadlock if the system is low on memory or too fragmented. Fix it by dropping the lock as soon as we see a block eligible for L2ARC writing and picking it up later using an earlier inserted marker. While there, also reduce the scope of the hash lock, moving ZIO allocation and other operations not requiring header access out of it. All operations requiring header access move under the hash lock, since the L2_WRITING flag does not prevent header eviction, only the transition to the arc_l2c_only state with L1 header. To be able to manipulate the sublist lock and marker as needed, add a few more multilist functions and modify one. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #16040 --- include/sys/multilist.h | 5 +- module/zfs/arc.c | 179 +++++++++++++++++++++------------------- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 10 +-- module/zfs/metaslab.c | 8 +- module/zfs/multilist.c | 26 +++++- 6 files changed, 131 insertions(+), 99 deletions(-) diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab38..e7de86f2379b 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4db6c06148b1..1953640139b3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. @@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count) arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); - for (int i = 0; i < count; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_state_impl(). 
- */ - markers[i]->b_spa = 0; - - } + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); return (markers); } @@ -4011,7 +4023,7 @@ static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) - kmem_cache_free(hdr_full_cache, markers[i]); + arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -8628,7 +8640,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9040,9 +9052,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9051,10 +9063,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. @@ -9069,40 +9082,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; /* - * L2ARC fast warmup. - * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - if (arc_warm == B_FALSE) + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. 
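The hash lock is busy here (or the header proved ineligible and jumped to the skip label), so step to the next header in the scan direction rather than blocking while the sublist lock is held.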
*/ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9117,11 +9124,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9143,12 +9149,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. */ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9179,32 +9191,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). 
- */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9212,54 +9237,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). - */ + if (commit) { + /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9268,9 +9273,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? 
*/ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -10598,7 +10605,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5d1887175a53..72aaf7f19822 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -754,7 +754,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index d134d4958f7c..f8bd7422a5df 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg) sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); @@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5809a832bcb0..dbfc00362ff8 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > @@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc) unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * 
zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index b1cdf1c5c5f4..3d3ef86e6839 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. *