Skip to content

Commit

Permalink
Add TRIM support for disk vdevs.
Browse files Browse the repository at this point in the history
This adds TRIM (a.k.a. UNMAP, DISCARD, hole punching) support for disk
vdevs. The original patch is from Pawel Jakub Dawidek
<[email protected]> who wrote it for FreeBSD. Etienne Dechamps
<[email protected]> ported it to ZFS On Linux.

The code builds a map of regions that were freed. On every write the
code consults the map and removes any ranges that were freed before
but are now being overwritten.

Freed blocks are not TRIMmed immediately. There is a tunable that defines
how many txgs we should wait before TRIMming freed blocks (64 by default).

There is a low priority thread that TRIMs ranges when the time comes.
During TRIM we keep in-flight ranges on a list to detect colliding
writes - we have to delay writes that collide with in-flight TRIMs in
case the requests get reordered and the write reaches the disk before
the TRIM. We don't have to do the same for in-flight writes, as
colliding writes just remove ranges to TRIM.

Most of the code stayed unchanged during the porting to Linux. The only
big change is in the vdev disk module, since the FreeBSD and Linux
interfaces for issuing discards to block devices are obviously different.
On FreeBSD it seems that issuing a DELETE request of any size is
sufficient; on Linux we have to be careful not to exceed maximum discard
limits. That's why we introduce a new vdev_disk_io_trim() function
inspired by the Linux blkdev_issue_discard() function and the
pre-existing vdev_disk_physio() function. The new function takes care of
splitting discard requests into smaller ones if necessary.

In theory, the code should work for main pool disk vdevs, slog disk
vdevs, L2ARC disk vdevs, and supports mirror and raidz. File vdevs are
not supported yet.

Note that the new feature is disabled by default (zfs_notrim=1). To use
it, you have to explicitly set the module parameter "zfs_notrim" to "0".
Be aware that this code is largely untested and brings huge risks of
potential data corruption. Use at your own risk and expect data loss.
  • Loading branch information
dechamps committed Aug 31, 2012
1 parent 2b28613 commit cc6cd40
Show file tree
Hide file tree
Showing 18 changed files with 827 additions and 27 deletions.
3 changes: 3 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ struct spa {
spa_proc_state_t spa_proc_state; /* see definition */
proc_t *spa_proc; /* "zpool-poolname" process */
uint64_t spa_did; /* if procp != p0, did of t1 */
kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
kmutex_t spa_trim_lock; /* protects spa_trim_cv */
kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
uint64_t spa_creation_version; /* version at pool creation */
Expand Down
51 changes: 51 additions & 0 deletions include/sys/trim_map.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2012 Pawel Jakub Dawidek <[email protected]>.
* All rights reserved.
*/

#ifndef _SYS_TRIM_MAP_H
#define _SYS_TRIM_MAP_H

#include <sys/avl.h>
#include <sys/list.h>
#include <sys/spa.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Per-vdev TRIM map interface.
 *
 * NOTE(review): the implementation (trim_map.c) is not visible from this
 * header; the descriptions below are inferred from the prototypes and
 * should be confirmed against it.
 *
 * trim_map_create() / trim_map_destroy() take the vdev whose TRIM map is
 * being set up or torn down (presumably vd->vdev_trimmap -- TODO confirm).
 *
 * trim_map_free() takes a zio; presumably it records the range freed by
 * that zio so it can be TRIMmed later -- TODO confirm.
 *
 * trim_map_write_start() / trim_map_write_done() bracket a write zio.
 * NOTE(review): the meaning of the boolean_t returned by
 * trim_map_write_start() is not shown here; it presumably tells the caller
 * whether the write may proceed or must wait for a colliding in-flight
 * TRIM -- verify before relying on it.
 */
extern void trim_map_create(vdev_t *vd);
extern void trim_map_destroy(vdev_t *vd);
extern void trim_map_free(zio_t *zio);
extern boolean_t trim_map_write_start(zio_t *zio);
extern void trim_map_write_done(zio_t *zio);

/*
 * Per-pool TRIM thread control.
 *
 * trim_thread_create() starts the pool's TRIM thread and
 * trim_thread_destroy() stops it.
 *
 * NOTE(review): trim_thread_destroy() may be invoked more than once for
 * the same pool (e.g. from both unload and deactivate paths), so it is
 * presumably safe to call when the thread is already stopped -- TODO
 * confirm.
 *
 * trim_thread_wakeup() presumably signals the TRIM thread so that it
 * re-examines pending work -- TODO confirm.
 */
extern void trim_thread_create(spa_t *spa);
extern void trim_thread_destroy(spa_t *spa);
extern void trim_thread_wakeup(spa_t *spa);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_TRIM_MAP_H */
1 change: 1 addition & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ typedef enum vdev_dtl_type {
} vdev_dtl_type_t;

extern int zfs_nocacheflush;
extern int zfs_notrim;

extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ struct vdev {
uint64_t vdev_unspare; /* unspare when resilvering done */
hrtime_t vdev_last_try; /* last reopen time */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_notrim; /* true if trim failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
Expand All @@ -201,6 +202,7 @@ struct vdev {
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
struct trim_map *vdev_trimmap;

/*
* For DTrace to work in userland (libzpool) context, these fields must
Expand Down
10 changes: 8 additions & 2 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ enum zio_compress {
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
#define ZIO_PRIORITY_TABLE_SIZE 12
#define ZIO_PRIORITY_TRIM (zio_priority_table[12])
#define ZIO_PRIORITY_TABLE_SIZE 13

#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
Expand Down Expand Up @@ -429,6 +430,9 @@ struct zio {

/* Taskq dispatching state */
taskq_ent_t io_tqent;

avl_node_t io_trim_node;
list_node_t io_trim_link;
};

extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
Expand Down Expand Up @@ -459,7 +463,8 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
uint64_t offset, uint64_t size, zio_done_func_t *done,
void *private, int priority, enum zio_flag flags);

extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
Expand All @@ -478,6 +483,7 @@ extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_trim(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size);
extern void zio_shrink(zio_t *zio, uint64_t size);

extern int zio_wait(zio_t *zio);
Expand Down
10 changes: 6 additions & 4 deletions include/sys/zio_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ enum zio_stage {

ZIO_STAGE_READY = 1 << 15, /* RWFCI */

ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */

ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */

Expand Down Expand Up @@ -143,7 +143,9 @@ enum zio_stage {
#define ZIO_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
ZIO_STAGE_DVA_FREE)
ZIO_STAGE_DVA_FREE | \
ZIO_STAGE_VDEV_IO_START | \
ZIO_STAGE_VDEV_IO_ASSESS)

#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
Expand Down
2 changes: 2 additions & 0 deletions lib/libspl/include/sys/dkio.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ struct dk_geom {
*/
#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */

#define DKIOCTRIM (DKIOC|35) /* TRIM a block */

struct dk_callback {
void (*dkc_callback)(void *dkc_cookie, int error);
void *dkc_cookie;
Expand Down
1 change: 1 addition & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/spa_history.c \
$(top_srcdir)/module/zfs/spa_misc.c \
$(top_srcdir)/module/zfs/space_map.c \
$(top_srcdir)/module/zfs/trim_map.c \
$(top_srcdir)/module/zfs/txg.c \
$(top_srcdir)/module/zfs/uberblock.c \
$(top_srcdir)/module/zfs/unique.c \
Expand Down
1 change: 1 addition & 0 deletions module/avl/avl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,7 @@ EXPORT_SYMBOL(avl_nearest);
EXPORT_SYMBOL(avl_add);
EXPORT_SYMBOL(avl_remove);
EXPORT_SYMBOL(avl_numnodes);
EXPORT_SYMBOL(avl_is_empty);
EXPORT_SYMBOL(avl_destroy_nodes);
EXPORT_SYMBOL(avl_destroy);
#endif
1 change: 1 addition & 0 deletions module/zfs/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/trim_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
$(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
Expand Down
17 changes: 17 additions & 0 deletions module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/bootprops.h>
Expand Down Expand Up @@ -854,6 +855,11 @@ spa_activate(spa_t *spa, int mode)
spa_create_zio_taskqs(spa);
}

/*
* Start TRIM thread.
*/
trim_thread_create(spa);

list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
Expand Down Expand Up @@ -884,6 +890,12 @@ spa_deactivate(spa_t *spa)
ASSERT(spa->spa_async_zio_root == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

/*
* Stop TRIM thread in case spa_unload() wasn't called before
* spa_deactivate().
*/
trim_thread_destroy(spa);

txg_list_destroy(&spa->spa_vdev_txg_list);

list_destroy(&spa->spa_config_dirty_list);
Expand Down Expand Up @@ -998,6 +1010,11 @@ spa_unload(spa_t *spa)

ASSERT(MUTEX_HELD(&spa_namespace_lock));

/*
* Stop TRIM thread.
*/
trim_thread_destroy(spa);

/*
* Stop async tasks.
*/
Expand Down
Loading

0 comments on commit cc6cd40

Please sign in to comment.