Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[2.2] Backport some ZIL, BRT and prefetcher patches #16106

Merged
merged 23 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
27224c2
ZIL: Detect single-threaded workloads
amotin Oct 24, 2023
98c24e9
ZIO: Optimize zio_flush()
amotin Nov 17, 2023
6e7331d
ZIL: Improve next log block size prediction
amotin Dec 21, 2023
41a8e55
ZIL: Update Linux tracing after #15635
amotin Jan 9, 2024
84d6363
Refactor dmu_prefetch().
amotin Aug 7, 2023
01d4876
Linux: Cleanup taskq threads spawn/exit
amotin Feb 13, 2024
8e86cd5
Update resume token at object receive.
amotin Mar 21, 2024
98d846c
BRT: Change brt_pending_tree sorting order
amotin Mar 21, 2024
85a00ac
ZAP: Some cleanups/micro-optimizations
amotin Mar 21, 2024
656f9d7
BRT: Skip duplicate BRT prefetches
amotin Mar 25, 2024
0e0786b
ZAP: Massively switch to _by_dnode() interfaces
amotin Mar 25, 2024
28768ed
BRT: Relax brt_pending_apply() locking
amotin Mar 25, 2024
38ce13b
BRT: Make BRT block sizes configurable
amotin Mar 25, 2024
8477add
BRT: Skip getting length in brt_entry_lookup()
amotin Mar 26, 2024
75f4140
BRT: Fix holes cloning.
amotin Mar 18, 2024
3c98b55
BRT: Fix tests to work on non-empty pools
amotin Mar 19, 2024
6552ab7
BRT: Check pool clone stats in more tests
amotin Mar 19, 2024
6e73d87
Improve dbuf_read() error reporting
amotin Apr 3, 2024
3a73611
Fix read errors race after block cloning
amotin Apr 8, 2024
f7b6c86
Speculative prefetch for reordered requests
amotin Apr 8, 2024
d52f7fb
Remove db_state DB_NOFILL checks from syncing context
amotin Apr 8, 2024
599743d
Small fix to prefetch ranges aggregation
amotin Apr 9, 2024
ddf864d
L2ARC: Relax locking during write
amotin Apr 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion cmd/arc_summary
Original file line number Diff line number Diff line change
Expand Up @@ -793,18 +793,27 @@ def section_dmu(kstats_dict):

zfetch_stats = isolate_section('zfetchstats', kstats_dict)

zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
zfetch_access_total = int(zfetch_stats['hits']) +\
int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
int(zfetch_stats['past']) + int(zfetch_stats['misses'])

prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
prt_i2('Stream hits:',
f_perc(zfetch_stats['hits'], zfetch_access_total),
f_hits(zfetch_stats['hits']))
future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
f_hits(future))
prt_i2('Hits behind stream:',
f_perc(zfetch_stats['past'], zfetch_access_total),
f_hits(zfetch_stats['past']))
prt_i2('Stream misses:',
f_perc(zfetch_stats['misses'], zfetch_access_total),
f_hits(zfetch_stats['misses']))
prt_i2('Streams limit reached:',
f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
f_hits(zfetch_stats['max_streams']))
prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
print()

Expand Down
2 changes: 1 addition & 1 deletion include/os/linux/spl/sys/taskq.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ typedef struct taskq {
/* list node for the cpu hotplug callback */
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
unsigned long lastshouldstop; /* when to purge dynamic */
unsigned long lastspawnstop; /* when to purge dynamic */
} taskq_t;

typedef struct taskq_ent {
Expand Down
14 changes: 10 additions & 4 deletions include/os/linux/zfs/sys/trace_zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@
__field(uint64_t, zl_parse_lr_seq) \
__field(uint64_t, zl_parse_blk_count) \
__field(uint64_t, zl_parse_lr_count) \
__field(uint64_t, zl_cur_used) \
__field(uint64_t, zl_cur_size) \
__field(uint64_t, zl_cur_left) \
__field(uint64_t, zl_cur_max) \
__field(clock_t, zl_replay_time) \
__field(uint64_t, zl_replay_blks)

Expand All @@ -72,7 +74,9 @@
__entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \
__entry->zl_parse_blk_count = zilog->zl_parse_blk_count;\
__entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \
__entry->zl_cur_used = zilog->zl_cur_used; \
__entry->zl_cur_size = zilog->zl_cur_size; \
__entry->zl_cur_left = zilog->zl_cur_left; \
__entry->zl_cur_max = zilog->zl_cur_max; \
__entry->zl_replay_time = zilog->zl_replay_time; \
__entry->zl_replay_blks = zilog->zl_replay_blks;

Expand All @@ -82,7 +86,8 @@
"replay %u stop_sync %u logbias %u sync %u " \
"parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \
"parse_blk_count %llu parse_lr_count %llu " \
"cur_used %llu replay_time %lu replay_blks %llu }"
"cur_size %llu cur_left %llu cur_max %llu replay_time %lu " \
"replay_blks %llu }"

#define ZILOG_TP_PRINTK_ARGS \
__entry->zl_lr_seq, __entry->zl_commit_lr_seq, \
Expand All @@ -92,7 +97,8 @@
__entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \
__entry->zl_parse_error, __entry->zl_parse_blk_seq, \
__entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \
__entry->zl_parse_lr_count, __entry->zl_cur_used, \
__entry->zl_parse_lr_count, __entry->zl_cur_size, \
__entry->zl_cur_left, __entry->zl_cur_max, \
__entry->zl_replay_time, __entry->zl_replay_blks

#define ITX_TP_STRUCT_ENTRY \
Expand Down
5 changes: 3 additions & 2 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -739,8 +739,6 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
void *dmu_buf_get_user(dmu_buf_t *db);

objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);

/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);
Expand Down Expand Up @@ -889,6 +887,9 @@ extern uint_t zfs_max_recordsize;
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);

typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
Expand Down
16 changes: 11 additions & 5 deletions include/sys/dmu_zfetch.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,24 @@ typedef struct zfetch {
int zf_numstreams; /* number of zstream_t's */
} zfetch_t;

/*
 * One range described by a pair of 16-bit bounds.  zstream_t embeds an array
 * of ZFETCH_RANGES of these as zs_ranges[] ("ranges from future").
 * NOTE(review): the units of start/end and whether `end` is inclusive are not
 * visible in this header; presumably they are offsets relative to the
 * stream's expected position (zs_blkid) -- confirm against dmu_zfetch.c.
 */
typedef struct zsrange {
uint16_t start;
uint16_t end;
} zsrange_t;

#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */

typedef struct zstream {
list_node_t zs_node; /* link for zf_stream */
uint64_t zs_blkid; /* expect next access at this blkid */
uint_t zs_atime; /* time last prefetch issued */
zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */
unsigned int zs_pf_dist; /* data prefetch distance in bytes */
unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */
uint64_t zs_pf_start; /* first data block to prefetch */
uint64_t zs_pf_end; /* data block to prefetch up to */
uint64_t zs_ipf_start; /* first data block to prefetch L1 */
uint64_t zs_ipf_end; /* data block to prefetch L1 up to */

list_node_t zs_node; /* link for zf_stream */
hrtime_t zs_atime; /* time last prefetch issued */
zfetch_t *zs_fetch; /* parent fetch */
boolean_t zs_missed; /* stream saw cache misses */
boolean_t zs_more; /* need more distant prefetch */
zfs_refcount_t zs_callers; /* number of pending callers */
Expand All @@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t);

Expand Down
5 changes: 4 additions & 1 deletion include/sys/multilist.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);

multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
void multilist_sublist_lock(multilist_sublist_t *);
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);

void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
int multilist_sublist_is_empty(multilist_sublist_t *);
Expand Down
8 changes: 8 additions & 0 deletions include/sys/zap.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key,
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);

/*
* Set the attribute with the given name to the given value. If an
Expand All @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);

/*
* Get the length (in integers) and the integer size of the specified
Expand All @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, dmu_tx_t *tx);

/*
* Returns (in *count) the number of attributes in the specified zap
Expand Down
1 change: 1 addition & 0 deletions include/sys/zap_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ typedef struct zap {
dmu_buf_user_t zap_dbu;
objset_t *zap_objset;
uint64_t zap_object;
dnode_t *zap_dnode;
struct dmu_buf *zap_dbuf;
krwlock_t zap_rwlock;
boolean_t zap_ismicro;
Expand Down
8 changes: 4 additions & 4 deletions include/sys/zap_leaf.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ struct zap_stats;
* entries - header space (2*chunksize)
*/
#define ZAP_LEAF_NUMCHUNKS_BS(bs) \
(((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
(((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
ZAP_LEAF_CHUNKSIZE - 2)

#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
Expand Down Expand Up @@ -80,7 +80,7 @@ struct zap_stats;
* chunks per entry (3).
*/
#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs))
#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))

Expand Down Expand Up @@ -163,7 +163,7 @@ typedef struct zap_leaf {
dmu_buf_user_t l_dbu;
krwlock_t l_rwlock;
uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
int l_bs; /* block size shift */
uint_t l_bs; /* block size shift */
dmu_buf_t *l_dbuf;
} zap_leaf_t;

Expand Down Expand Up @@ -243,7 +243,7 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
*/

extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t len);
extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
struct zap_stats *zs);
Expand Down
10 changes: 7 additions & 3 deletions include/sys/zil_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;

#define ZIL_PREV_BLKS 16
#define ZIL_BURSTS 8

/*
* Stable storage intent log management structure. One per dataset.
Expand Down Expand Up @@ -216,14 +216,18 @@ struct zilog {
uint64_t zl_parse_lr_count; /* number of log records parsed */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_cur_used; /* current commit log size used */
uint64_t zl_cur_size; /* current burst full size */
uint64_t zl_cur_left; /* current burst remaining size */
uint64_t zl_cur_max; /* biggest record in current burst */
list_t zl_lwb_list; /* in-flight log write list */
avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */
uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */

Expand Down
18 changes: 4 additions & 14 deletions man/man4/spl.4
Original file line number Diff line number Diff line change
Expand Up @@ -186,18 +186,8 @@ reading it could cause a lock-up if the list grow too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
(Linux-only)
How long a taskq has to have had no work before we tear it down.
Previously, we would tear down a dynamic taskq worker as soon
as we noticed it had no work, but it was observed that this led
to a lot of churn in tearing down things we then immediately
spawned anew.
In practice, it seems any nonzero value will remove the vast
majority of this churn, while the nontrivially larger value
was chosen to help filter out the little remaining churn on
a mostly idle system.
Setting this value to
.Sy 0
will revert to the previous behavior.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads to exit more often and potentially be
respawned again on demand, causing more churn.
.El
37 changes: 27 additions & 10 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -245,12 +245,25 @@ For blocks that could be forced to be a gang block (due to
.Sy metaslab_force_ganging ) ,
force this many of them to be gang blocks.
.
.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
Controls prefetching BRT records for blocks which are going to be cloned.
.
.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
Default BRT ZAP data block size as a power of 2. Note that changing this after
creating a BRT on the pool will not affect existing BRTs, only newly created
ones.
.
.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
Default BRT ZAP indirect block size as a power of 2. Note that changing this
after creating a BRT on the pool will not affect existing BRTs, only newly
created ones.
.
.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP data block size as a power of 2. Note that changing this after
creating a DDT on the pool will not affect existing DDTs, only newly created
ones.
.
.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP indirect block size as a power of 2. Note that changing this
after creating a DDT on the pool will not affect existing DDTs, only newly
created ones.
Expand Down Expand Up @@ -531,6 +544,10 @@ However, this is limited by
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
.
.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
Log2 fraction of holes in speculative prefetch stream allowed for it to
proceed.
.
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream.
Prefetch distance starts from the demand access size and quickly grows to
Expand All @@ -545,6 +562,13 @@ Max bytes to prefetch per stream.
.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
Max bytes to prefetch indirects for per stream.
.
.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
Requests within this byte distance from the current prefetch stream position
are considered parts of the stream, reordered due to parallel processing.
Such requests do not advance the stream position immediately unless the
.Sy zfetch_hole_shift
fill threshold is reached, but are saved to fill holes in the stream later.
.
.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
Max number of streams per zfetch (prefetch streams per file).
.
Expand Down Expand Up @@ -799,7 +823,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action.
.
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to
stable storage.
Expand Down Expand Up @@ -2206,13 +2230,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds ZIL care to delay block commit,
waiting for more records.
If ZIL writes are too fast, kernel may not be able sleep for so short interval,
increasing log latency above allowed by
.Sy zfs_commit_timeout_pct .
.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by
the ZIL after an LWB write has completed.
Expand Down
4 changes: 1 addition & 3 deletions module/os/freebsd/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,

ASSERT3S(outcount, <=, bufsize);

/* Prefetch znode */
if (prefetch)
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

/*
* Move to the next entry, fill in the previous offset.
Expand Down
Loading
Loading