diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index d367486a04a9..5581cc425dd0 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -217,6 +217,9 @@ struct spa {
 	spa_proc_state_t spa_proc_state;	/* see definition */
 	proc_t		*spa_proc;		/* "zpool-poolname" process */
 	uint64_t	spa_did;		/* if procp != p0, did of t1 */
+	kthread_t	*spa_trim_thread;	/* thread sending TRIM I/Os */
+	kmutex_t	spa_trim_lock;		/* protects spa_trim_cv */
+	kcondvar_t	spa_trim_cv;		/* used to notify TRIM thread */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	uint64_t	spa_creation_version;	/* version at pool creation */
diff --git a/include/sys/trim_map.h b/include/sys/trim_map.h
new file mode 100644
index 000000000000..ba82a05ee305
--- /dev/null
+++ b/include/sys/trim_map.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pjd@FreeBSD.org>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_TRIM_MAP_H
+#define	_SYS_TRIM_MAP_H
+
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern void trim_map_create(vdev_t *vd);
+extern void trim_map_destroy(vdev_t *vd);
+extern void trim_map_free(zio_t *zio);
+extern boolean_t trim_map_write_start(zio_t *zio);
+extern void trim_map_write_done(zio_t *zio);
+
+extern void trim_thread_create(spa_t *spa);
+extern void trim_thread_destroy(spa_t *spa);
+extern void trim_thread_wakeup(spa_t *spa);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_TRIM_MAP_H */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 50dbe695c73b..5885d50cc489 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -45,6 +45,7 @@ typedef enum vdev_dtl_type {
 } vdev_dtl_type_t;
 
 extern int zfs_nocacheflush;
+extern int zfs_notrim;
 
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 8862d9bc9854..f74288e01f11 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -186,6 +186,7 @@ struct vdev {
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
 	hrtime_t	vdev_last_try;	/* last reopen time */
 	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
+	boolean_t	vdev_notrim;	/* true if trim failed */
 	boolean_t	vdev_checkremove; /* temporary online test */
 	boolean_t	vdev_forcefault; /* force online fault */
 	boolean_t	vdev_splitting;	/* split or repair in progress */
@@ -201,6 +202,7 @@ struct vdev {
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs */
 	zio_t		*vdev_probe_zio; /* root of current probe */
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state */
+	struct trim_map	*vdev_trimmap;
 
 	/*
	 * For DTrace to work in userland (libzpool) context, these fields must
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4f20cab6591e..9ae755168991 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -140,7 +140,8 @@ enum zio_compress {
 #define	ZIO_PRIORITY_RESILVER		(zio_priority_table[9])
 #define	ZIO_PRIORITY_SCRUB		(zio_priority_table[10])
 #define	ZIO_PRIORITY_DDT_PREFETCH	(zio_priority_table[11])
-#define	ZIO_PRIORITY_TABLE_SIZE		12
+#define	ZIO_PRIORITY_TRIM		(zio_priority_table[12])
+#define	ZIO_PRIORITY_TABLE_SIZE		13
 
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
@@ -429,6 +430,9 @@ struct zio {
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
+
+	avl_node_t	io_trim_node;
+	list_node_t	io_trim_link;
 };
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
@@ -459,7 +463,8 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
 	zio_done_func_t *done, void *private, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
+    uint64_t offset, uint64_t size, zio_done_func_t *done,
+    void *private, int priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
 	uint64_t size, void *data, int checksum,
@@ -478,6 +483,7 @@ extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
 	blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_trim(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index d90bd8bd5921..86ea9790784f 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -60,9 +60,9 @@ enum zio_stage {
 
 	ZIO_STAGE_READY			= 1 << 15,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RWF-- */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RWF-I */
 
 	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 19,	/* R---- */
 
@@ -143,7 +143,9 @@ enum zio_stage {
 #define	ZIO_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\
-	ZIO_STAGE_DVA_FREE)
+	ZIO_STAGE_DVA_FREE |			\
+	ZIO_STAGE_VDEV_IO_START |		\
+	ZIO_STAGE_VDEV_IO_ASSESS)
 
 #define	ZIO_DDT_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h
index 6b430b81f946..5f02cfef52ad 100644
--- a/lib/libspl/include/sys/dkio.h
+++ b/lib/libspl/include/sys/dkio.h
@@ -188,6 +188,8 @@ struct dk_geom {
  */
 #define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
 
+#define	DKIOCTRIM		(DKIOC|35)	/* TRIM a block */
+
 struct dk_callback {
 	void (*dkc_callback)(void *dkc_cookie, int error);
 	void *dkc_cookie;
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index cbe3acd34c60..ee8c8563e9e1 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -59,6 +59,7 @@ libzpool_la_SOURCES = \
 	$(top_srcdir)/module/zfs/spa_history.c \
 	$(top_srcdir)/module/zfs/spa_misc.c \
 	$(top_srcdir)/module/zfs/space_map.c \
+	$(top_srcdir)/module/zfs/trim_map.c \
 	$(top_srcdir)/module/zfs/txg.c \
 	$(top_srcdir)/module/zfs/uberblock.c \
 	$(top_srcdir)/module/zfs/unique.c \
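Aside (not part of the patch): the new io_trim_node and io_trim_link fields embed the AVL and list linkage for the trim map directly in each zio, so a zio can be indexed by a vdev's trim map without any extra allocation. Below is a minimal userland sketch of that embedded-node idiom, written against the same illumos AVL API the patch uses (the range type, comparator, and values are hypothetical, and it assumes the AVL implementation from module/avl is linked in, e.g. via libzpool):

	/*
	 * Sketch of the embedded-node idiom behind io_trim_node: the AVL
	 * linkage lives inside the tracked object, and avl_create() is told
	 * its offset so no separate node allocation is ever needed.
	 */
	#include <sys/avl.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef struct range {
		uint64_t	r_offset;
		uint64_t	r_size;
		avl_node_t	r_node;		/* embedded AVL linkage */
	} range_t;

	static int
	range_compare(const void *x1, const void *x2)
	{
		const range_t *r1 = x1;
		const range_t *r2 = x2;

		if (r1->r_offset < r2->r_offset)
			return (-1);
		if (r1->r_offset > r2->r_offset)
			return (1);
		return (0);
	}

	int
	main(void)
	{
		avl_tree_t tree;
		range_t a = { .r_offset = 4096, .r_size = 512 };

		/* The offsetof() is how avl.c finds the node in the object. */
		avl_create(&tree, range_compare, sizeof (range_t),
		    offsetof(range_t, r_node));
		avl_add(&tree, &a);
		printf("nodes: %lu\n", (unsigned long)avl_numnodes(&tree));
		avl_remove(&tree, &a);
		avl_destroy(&tree);
		return (0);
	}

trim_map_create() further down uses exactly this avl_create()/offsetof() pattern for tm_inflight_writes, keyed on the two fields added here.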
diff --git a/module/avl/avl.c b/module/avl/avl.c
index e000647be238..fbe6903e0d41 100644
--- a/module/avl/avl.c
+++ b/module/avl/avl.c
@@ -1052,6 +1052,7 @@
 EXPORT_SYMBOL(avl_nearest);
 EXPORT_SYMBOL(avl_add);
 EXPORT_SYMBOL(avl_remove);
 EXPORT_SYMBOL(avl_numnodes);
+EXPORT_SYMBOL(avl_is_empty);
 EXPORT_SYMBOL(avl_destroy_nodes);
 EXPORT_SYMBOL(avl_destroy);
 #endif
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 98576d1d2ed5..758763863175 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -43,6 +43,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/trim_map.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index a3d52c8b1c70..04b3cfb361e5 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -63,6 +63,7 @@
 #include
 #include
 #include
+#include <sys/trim_map.h>
 
 #ifdef _KERNEL
 #include
@@ -854,6 +855,11 @@ spa_activate(spa_t *spa, int mode)
 		spa_create_zio_taskqs(spa);
 	}
 
+	/*
+	 * Start TRIM thread.
+	 */
+	trim_thread_create(spa);
+
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
@@ -884,6 +890,12 @@ spa_deactivate(spa_t *spa)
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
+	/*
+	 * Stop TRIM thread in case spa_unload() wasn't called before
+	 * spa_deactivate().
+	 */
+	trim_thread_destroy(spa);
+
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
@@ -998,6 +1010,11 @@ spa_unload(spa_t *spa)
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+	/*
+	 * Stop TRIM thread.
+	 */
+	trim_thread_destroy(spa);
+
 	/*
 	 * Stop async tasks.
 	 */
diff --git a/module/zfs/trim_map.c b/module/zfs/trim_map.c
new file mode 100644
index 000000000000..80a7c7b5cd87
--- /dev/null
+++ b/module/zfs/trim_map.c
@@ -0,0 +1,535 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pjd@FreeBSD.org>.
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/trim_map.h>
+
+typedef struct trim_map {
+	list_t		tm_head;		/* List of segments sorted by txg. */
+	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
+	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
+	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
+	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
+	kmutex_t	tm_lock;
+} trim_map_t;
+
+typedef struct trim_seg {
+	avl_node_t	ts_node;	/* AVL node. */
+	list_node_t	ts_next;	/* List element. */
+	uint64_t	ts_start;	/* Starting offset of this segment. */
+	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
+	uint64_t	ts_txg;		/* Segment creation txg. */
+} trim_seg_t;
+
+/*
+ * TRIM support. Disabled by default.
+ * HIGHLY EXPERIMENTAL, USE AT YOUR OWN RISK, EXPECT DATA LOSS
+ */
+int zfs_notrim = B_TRUE;
+int trim_txg_limit = 64;
+
+static int
+trim_map_seg_compare(const void *x1, const void *x2)
+{
+	const trim_seg_t *s1 = x1;
+	const trim_seg_t *s2 = x2;
+
+	if (s1->ts_start < s2->ts_start) {
+		if (s1->ts_end > s2->ts_start)
+			return (0);
+		return (-1);
+	}
+	if (s1->ts_start > s2->ts_start) {
+		if (s1->ts_start < s2->ts_end)
+			return (0);
+		return (1);
+	}
+	return (0);
+}
+
+static int
+trim_map_zio_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_offset < z2->io_offset) {
+		if (z1->io_offset + z1->io_size > z2->io_offset)
+			return (0);
+		return (-1);
+	}
+	if (z1->io_offset > z2->io_offset) {
+		if (z1->io_offset < z2->io_offset + z2->io_size)
+			return (0);
+		return (1);
+	}
+	return (0);
+}
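Aside (not part of the patch): both comparators treat any overlap as equality, so avl_find() returns an arbitrary overlapping element rather than an exact key match. A standalone sketch of that contract (the seg type and values are hypothetical; the comparator body mirrors trim_map_seg_compare() above):

	/* Demonstrates the "overlap compares as equal" comparator contract. */
	#include <stdint.h>
	#include <stdio.h>

	typedef struct seg {
		uint64_t start;
		uint64_t end;	/* non-inclusive */
	} seg_t;

	static int
	seg_compare(const seg_t *s1, const seg_t *s2)
	{
		if (s1->start < s2->start)
			return (s1->end > s2->start ? 0 : -1);
		if (s1->start > s2->start)
			return (s1->start < s2->end ? 0 : 1);
		return (0);
	}

	int
	main(void)
	{
		seg_t a = { 0, 4096 };
		seg_t b = { 2048, 8192 };	/* overlaps a */
		seg_t c = { 4096, 8192 };	/* adjacent to a, no overlap */

		printf("a vs b: %d\n", seg_compare(&a, &b));	/* 0: overlap */
		printf("a vs c: %d\n", seg_compare(&a, &c));	/* -1: disjoint */
		return (0);
	}

Because this "equality" is not transitive, one avl_find() is not guaranteed to surface every overlapping segment; trim_map_write_start() further down therefore re-runs avl_find() in a loop until no overlap remains.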
+void
+trim_map_create(vdev_t *vd)
+{
+	trim_map_t *tm;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (zfs_notrim)
+		return;
+
+	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
+	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&tm->tm_head, sizeof (trim_seg_t),
+	    offsetof(trim_seg_t, ts_next));
+	list_create(&tm->tm_pending_writes, sizeof (zio_t),
+	    offsetof(zio_t, io_trim_link));
+	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
+	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
+	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
+	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
+	vd->vdev_trimmap = tm;
+}
+
+void
+trim_map_destroy(vdev_t *vd)
+{
+	trim_map_t *tm;
+	trim_seg_t *ts;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (zfs_notrim)
+		return;
+
+	tm = vd->vdev_trimmap;
+	if (tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	while ((ts = list_head(&tm->tm_head)) != NULL) {
+		avl_remove(&tm->tm_queued_frees, ts);
+		list_remove(&tm->tm_head, ts);
+		kmem_free(ts, sizeof (*ts));
+	}
+	mutex_exit(&tm->tm_lock);
+
+	avl_destroy(&tm->tm_queued_frees);
+	avl_destroy(&tm->tm_inflight_frees);
+	avl_destroy(&tm->tm_inflight_writes);
+	list_destroy(&tm->tm_pending_writes);
+	list_destroy(&tm->tm_head);
+	mutex_destroy(&tm->tm_lock);
+	kmem_free(tm, sizeof (*tm));
+	vd->vdev_trimmap = NULL;
+}
+
+static void
+trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end,
+    uint64_t txg)
+{
+	avl_index_t where;
+	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
+	boolean_t merge_before, merge_after;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+	VERIFY(start < end);
+
+	tsearch.ts_start = start;
+	tsearch.ts_end = end;
+
+	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
+	if (ts != NULL) {
+		if (start < ts->ts_start)
+			trim_map_segment_add(tm, start, ts->ts_start, txg);
+		if (end > ts->ts_end)
+			trim_map_segment_add(tm, ts->ts_end, end, txg);
+		return;
+	}
+
+	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
+	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
+
+	merge_before = (ts_before != NULL && ts_before->ts_end == start &&
+	    ts_before->ts_txg == txg);
+	merge_after = (ts_after != NULL && ts_after->ts_start == end &&
+	    ts_after->ts_txg == txg);
+
+	if (merge_before && merge_after) {
+		avl_remove(&tm->tm_queued_frees, ts_before);
+		list_remove(&tm->tm_head, ts_before);
+		ts_after->ts_start = ts_before->ts_start;
+		kmem_free(ts_before, sizeof (*ts_before));
+	} else if (merge_before) {
+		ts_before->ts_end = end;
+	} else if (merge_after) {
+		ts_after->ts_start = start;
+	} else {
+		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
+		ts->ts_start = start;
+		ts->ts_end = end;
+		ts->ts_txg = txg;
+		avl_insert(&tm->tm_queued_frees, ts, where);
+		list_insert_tail(&tm->tm_head, ts);
+	}
+}
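Aside (not part of the patch): the four branches at the end of trim_map_segment_add() coalesce freed ranges that abut and share a txg, including the "bridge" case where a new range joins two existing neighbors. A toy, array-based model of just that merge logic (hypothetical; the real code keeps segments in an AVL tree plus a txg-ordered list):

	/* Toy model of the merge cases in trim_map_segment_add(). */
	#include <stdint.h>
	#include <stdio.h>

	typedef struct { uint64_t start, end, txg; } seg_t;

	static seg_t segs[16];
	static int nsegs;

	static void
	segment_add(uint64_t start, uint64_t end, uint64_t txg)
	{
		seg_t *before = NULL, *after = NULL;
		int i;

		for (i = 0; i < nsegs; i++) {
			if (segs[i].txg != txg)
				continue;
			if (segs[i].end == start)	/* abuts on the left */
				before = &segs[i];
			if (segs[i].start == end)	/* abuts on the right */
				after = &segs[i];
		}
		if (before != NULL && after != NULL) {
			/* Bridge: extend 'after' downward, drop 'before'. */
			after->start = before->start;
			*before = segs[--nsegs];
		} else if (before != NULL) {
			before->end = end;
		} else if (after != NULL) {
			after->start = start;
		} else {
			segs[nsegs++] = (seg_t){ start, end, txg };
		}
	}

	int
	main(void)
	{
		int i;

		segment_add(0, 4096, 7);
		segment_add(8192, 12288, 7);
		segment_add(4096, 8192, 7);	/* bridges both neighbors */

		for (i = 0; i < nsegs; i++)	/* prints [0, 12288) txg 7 */
			printf("[%llu, %llu) txg %llu\n",
			    (unsigned long long)segs[i].start,
			    (unsigned long long)segs[i].end,
			    (unsigned long long)segs[i].txg);
		return (0);
	}

Coalescing matters because each queued segment later becomes one TRIM command; merging adjacent frees keeps the command count (and per-bio overhead) down.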
+static void
+trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
+    uint64_t end)
+{
+	trim_seg_t *nts;
+	boolean_t left_over, right_over;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+	left_over = (ts->ts_start < start);
+	right_over = (ts->ts_end > end);
+
+	if (left_over && right_over) {
+		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
+		nts->ts_start = end;
+		nts->ts_end = ts->ts_end;
+		nts->ts_txg = ts->ts_txg;
+		ts->ts_end = start;
+		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
+		list_insert_after(&tm->tm_head, ts, nts);
+	} else if (left_over) {
+		ts->ts_end = start;
+	} else if (right_over) {
+		ts->ts_start = end;
+	} else {
+		avl_remove(&tm->tm_queued_frees, ts);
+		list_remove(&tm->tm_head, ts);
+		kmem_free(ts, sizeof (*ts));
+	}
+}
+
+static void
+trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end,
+    uint64_t txg)
+{
+	zio_t *zs;
+	/*
+	 * Declaring a zio_t variable on the stack makes the frame size too
+	 * large for the taste of GCC.  We work around the issue by allocating
+	 * just the fields we actually need: io_size lies before io_offset in
+	 * struct zio, so a buffer extending through io_offset covers both.
+	 */
+	char zsearch_buffer[offsetof(zio_t, io_offset) +
+	    sizeof (zs->io_offset)];
+	zio_t *zsearch = (zio_t *)zsearch_buffer;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+	zsearch->io_offset = start;
+	zsearch->io_size = end - start;
+
+	zs = avl_find(&tm->tm_inflight_writes, zsearch, NULL);
+	if (zs == NULL) {
+		trim_map_segment_add(tm, start, end, txg);
+		return;
+	}
+	if (start < zs->io_offset)
+		trim_map_free_locked(tm, start, zs->io_offset, txg);
+	if (zs->io_offset + zs->io_size < end)
+		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end,
+		    txg);
+}
+
+void
+trim_map_free(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	trim_map_t *tm = vd->vdev_trimmap;
+
+	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	trim_map_free_locked(tm, zio->io_offset,
+	    zio->io_offset + zio->io_size, vd->vdev_spa->spa_syncing_txg);
+	mutex_exit(&tm->tm_lock);
+}
+
+boolean_t
+trim_map_write_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t tsearch, *ts;
+	uint64_t start, end;
+
+	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+		return (B_TRUE);
+
+	start = zio->io_offset;
+	end = start + zio->io_size;
+	tsearch.ts_start = start;
+	tsearch.ts_end = end;
+
+	mutex_enter(&tm->tm_lock);
+
+	/*
+	 * Check for colliding in-flight frees.
+	 */
+	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
+	if (ts != NULL) {
+		list_insert_tail(&tm->tm_pending_writes, zio);
+		mutex_exit(&tm->tm_lock);
+		return (B_FALSE);
+	}
+
+	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+	if (ts != NULL) {
+		/*
+		 * Loop until all overlapping segments are removed.
+		 */
+		do {
+			trim_map_segment_remove(tm, ts, start, end);
+			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+		} while (ts != NULL);
+	}
+	avl_add(&tm->tm_inflight_writes, zio);
+
+	mutex_exit(&tm->tm_lock);
+
+	return (B_TRUE);
+}
+
+void
+trim_map_write_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	trim_map_t *tm = vd->vdev_trimmap;
+
+	if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	avl_remove(&tm->tm_inflight_writes, zio);
+	mutex_exit(&tm->tm_lock);
+}
+
+/*
+ * Return the oldest segment (the one with the lowest txg), or NULL if
+ * the list is empty or the first element's txg is greater than the txg
+ * passed as an argument.
+ */
+static trim_seg_t *
+trim_map_first(trim_map_t *tm, uint64_t txg)
+{
+	trim_seg_t *ts;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+	ts = list_head(&tm->tm_head);
+	if (ts != NULL && ts->ts_txg <= txg)
+		return (ts);
+	return (NULL);
+}
+
+static void
+trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t *ts;
+	uint64_t txglimit;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (tm == NULL)
+		return;
+
+	txglimit = spa->spa_syncing_txg - trim_txg_limit;
+
+	mutex_enter(&tm->tm_lock);
+	/*
+	 * Loop until we send all frees up to the txglimit.
+	 */
+	while ((ts = trim_map_first(tm, txglimit)) != NULL) {
+		list_remove(&tm->tm_head, ts);
+		avl_remove(&tm->tm_queued_frees, ts);
+		avl_add(&tm->tm_inflight_frees, ts);
+		zio_trim(zio, vd, ts->ts_start, ts->ts_end - ts->ts_start);
+	}
+	mutex_exit(&tm->tm_lock);
+}
+
+static void
+trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
+{
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t *ts;
+	list_t pending_writes;
+	zio_t *zio;
+	void *cookie;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	if (!avl_is_empty(&tm->tm_inflight_frees)) {
+		cookie = NULL;
+		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
+		    &cookie)) != NULL) {
+			kmem_free(ts, sizeof (*ts));
+		}
+	}
+	list_create(&pending_writes, sizeof (zio_t),
+	    offsetof(zio_t, io_trim_link));
+	list_move_tail(&pending_writes, &tm->tm_pending_writes);
+	mutex_exit(&tm->tm_lock);
+
+	while ((zio = list_remove_head(&pending_writes)) != NULL) {
+		zio_vdev_io_reissue(zio);
+		zio_execute(zio);
+	}
+	list_destroy(&pending_writes);
+}
+
+static void
+trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		trim_map_vdev_commit(spa, zio, vd);
+	} else {
+		for (c = 0; c < vd->vdev_children; c++)
+			trim_map_commit(spa, zio, vd->vdev_child[c]);
+	}
+}
+
+static void
+trim_map_commit_done(spa_t *spa, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		trim_map_vdev_commit_done(spa, vd);
+	} else {
+		for (c = 0; c < vd->vdev_children; c++)
+			trim_map_commit_done(spa, vd->vdev_child[c]);
+	}
+}
+
+static void
+trim_thread(void *arg)
+{
+	spa_t *spa = arg;
+	zio_t *zio;
+
+	for (;;) {
+		mutex_enter(&spa->spa_trim_lock);
+		if (spa->spa_trim_thread == NULL) {
+			spa->spa_trim_thread = curthread;
+			cv_signal(&spa->spa_trim_cv);
+			mutex_exit(&spa->spa_trim_lock);
+			thread_exit();
+		}
+		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+		mutex_exit(&spa->spa_trim_lock);
+
+		zio = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL);
+
+		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+		trim_map_commit(spa, zio, spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_STATE, FTAG);
+
+		(void) zio_wait(zio);
+
+		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+		trim_map_commit_done(spa, spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_STATE, FTAG);
+	}
+}
+
+void
+trim_thread_create(spa_t *spa)
+{
+
+	if (zfs_notrim)
+		return;
+
+	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
+	mutex_enter(&spa->spa_trim_lock);
+	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0,
+	    &p0, TS_RUN, minclsyspri);
+	mutex_exit(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_destroy(spa_t *spa)
+{
+
+	if (zfs_notrim)
+		return;
+	if (spa->spa_trim_thread == NULL)
+		return;
+
+	mutex_enter(&spa->spa_trim_lock);
+	/* Setting spa_trim_thread to NULL tells the thread to stop. */
+	spa->spa_trim_thread = NULL;
+	cv_signal(&spa->spa_trim_cv);
+	/* The thread will set it back to != NULL on exit. */
+	while (spa->spa_trim_thread == NULL)
+		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+	spa->spa_trim_thread = NULL;
+	mutex_exit(&spa->spa_trim_lock);
+
+	cv_destroy(&spa->spa_trim_cv);
+	mutex_destroy(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_wakeup(spa_t *spa)
+{
+
+	if (zfs_notrim)
+		return;
+	if (spa->spa_trim_thread == NULL)
+		return;
+
+	mutex_enter(&spa->spa_trim_lock);
+	cv_signal(&spa->spa_trim_cv);
+	mutex_exit(&spa->spa_trim_lock);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_notrim, int, 0644);
+MODULE_PARM_DESC(zfs_notrim, "Disable TRIM (default is 1; 0 IS HIGHLY "
+	"EXPERIMENTAL, USE AT YOUR OWN RISK, EXPECT DATA LOSS)");
+
+module_param(trim_txg_limit, int, 0644);
+MODULE_PARM_DESC(trim_txg_limit, "Delay TRIMs by this many TXGs.");
+#endif
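Aside (not part of the patch): the start/stop handshake above is easy to miss. trim_thread_destroy() NULLs spa_trim_thread to request shutdown, and the worker acknowledges by writing a non-NULL value back before exiting; because the pointer itself carries the state, a cv_signal() delivered while the worker is busy committing TRIMs is never lost. A userland sketch of the same protocol with POSIX threads (all names hypothetical):

	/* Sketch of the spa_trim_thread shutdown handshake, in pthreads. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t trim_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t trim_cv = PTHREAD_COND_INITIALIZER;
	static void *trim_thread_ptr;		/* NULL means "please stop" */

	static void *
	worker(void *arg)
	{
		for (;;) {
			pthread_mutex_lock(&trim_lock);
			if (trim_thread_ptr == NULL) {
				/* Acknowledge the stop request and exit. */
				trim_thread_ptr = (void *)1;
				pthread_cond_signal(&trim_cv);
				pthread_mutex_unlock(&trim_lock);
				return (NULL);
			}
			pthread_cond_wait(&trim_cv, &trim_lock);
			pthread_mutex_unlock(&trim_lock);

			/* ... commit queued TRIMs here, lock dropped ... */
		}
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_mutex_lock(&trim_lock);
		trim_thread_ptr = &tid;
		pthread_create(&tid, NULL, worker, NULL);
		pthread_mutex_unlock(&trim_lock);

		/* Destroy: request stop, then wait for the acknowledgment. */
		pthread_mutex_lock(&trim_lock);
		trim_thread_ptr = NULL;
		pthread_cond_signal(&trim_cv);
		while (trim_thread_ptr == NULL)
			pthread_cond_wait(&trim_cv, &trim_lock);
		trim_thread_ptr = NULL;
		pthread_mutex_unlock(&trim_lock);

		pthread_join(tid, NULL);
		printf("trim thread stopped\n");
		return (0);
	}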
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 06c7d0c8937b..455acec854a7 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include <sys/trim_map.h>
 
 /*
  * Virtual device management.
@@ -1213,6 +1214,11 @@ vdev_open(vdev_t *vd)
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
+	if (vd->vdev_ops->vdev_op_leaf) {
+		vd->vdev_notrim = B_FALSE;
+		trim_map_create(vd);
+	}
+
 	for (c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1451,6 +1457,9 @@ vdev_close(vdev_t *vd)
 
 	vdev_cache_purge(vd);
 
+	if (vd->vdev_ops->vdev_op_leaf)
+		trim_map_destroy(vd);
+
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index fd40b10055fb..b5d72dab64c5 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -291,6 +291,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	v->vdev_notrim = !blk_queue_discard(bdev_get_queue(bdev));
+
 	/* Try to set the io scheduler elevator algorithm */
 	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);
 
@@ -639,6 +641,101 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 }
 #endif /* HAVE_BIO_EMPTY_BARRIER */
 
+static int
+vdev_disk_io_trim(struct block_device *bdev, zio_t *zio)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned int max_discard_sectors;
+	dio_request_t *dr;
+	sector_t bio_sector, bio_sector_count;
+	int bio_count = 16;
+	int i = 0;
+
+	if (!blk_queue_discard(q))
+		return (ENOTSUP);
+
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	if (!max_discard_sectors)
+		return (ENOTSUP);
+	if (q->limits.discard_granularity)
+		max_discard_sectors &= ~((q->limits.discard_granularity >> 9) - 1);
+
+retry:
+	dr = vdev_disk_dio_alloc(bio_count);
+	if (dr == NULL)
+		return (ENOMEM);
+
+	dr->dr_zio = zio;
+	dr->dr_rw = REQ_WRITE | REQ_DISCARD;
+
+	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+		bio_set_flags_failfast(bdev, &dr->dr_rw);
+
+	/*
+	 * When the discard size exceeds the maximum discard size for the
+	 * request queue we are forced to break the discard into multiple
+	 * bios and wait for them all to complete.
+	 */
+	bio_sector = zio->io_offset >> 9;
+	bio_sector_count = zio->io_size >> 9;
+	for (i = 0; i <= dr->dr_bio_count; i++) {
+
+		/* Finished constructing bios for the given range */
+		if (bio_sector_count <= 0)
+			break;
+
+		/*
+		 * By default only 'bio_count' bios per dio are allowed.
+		 * However, if we find ourselves in a situation where more
+		 * are needed we allocate a larger dio and warn the user.
+		 */
+		if (dr->dr_bio_count == i) {
+			vdev_disk_dio_free(dr);
+			bio_count *= 2;
+			printk(KERN_WARNING "Resized discard bios/dio to %d\n", bio_count);
+			goto retry;
+		}
+
+		dr->dr_bio[i] = bio_alloc(GFP_NOIO, 1);
+		if (dr->dr_bio[i] == NULL) {
+			vdev_disk_dio_free(dr);
+			return (ENOMEM);
+		}
+
+		/* Matching put called by vdev_disk_physio_completion */
+		vdev_disk_dio_get(dr);
+
+		dr->dr_bio[i]->bi_bdev = bdev;
+		dr->dr_bio[i]->bi_sector = bio_sector;
+		dr->dr_bio[i]->bi_rw = dr->dr_rw;
+		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_private = dr;
+
+		if (bio_sector_count > max_discard_sectors) {
+			dr->dr_bio[i]->bi_size = max_discard_sectors << 9;
+			bio_sector_count -= max_discard_sectors;
+			bio_sector += max_discard_sectors;
+		} else {
+			dr->dr_bio[i]->bi_size = bio_sector_count << 9;
+			bio_sector_count = 0;
+		}
+	}
+
+	/* Extra reference to protect dio_request during submit_bio */
+	vdev_disk_dio_get(dr);
+	if (zio)
+		zio->io_delay = jiffies_64;
+
+	/* Submit all bios associated with this dio */
+	for (i = 0; i < dr->dr_bio_count; i++)
+		if (dr->dr_bio[i])
+			submit_bio(dr->dr_rw, dr->dr_bio[i]);
+
+	(void) vdev_disk_dio_put(dr);
+
+	return (0);
+}
+
 static int
 vdev_disk_io_start(zio_t *zio)
 {
@@ -654,6 +751,9 @@ vdev_disk_io_start(zio_t *zio)
 			return ZIO_PIPELINE_CONTINUE;
 		}
 
+		if (zio->io_cmd == DKIOCTRIM)
+			break;
+
 		switch (zio->io_cmd) {
 
 		case DKIOCFLUSHWRITECACHE:
@@ -694,8 +794,16 @@ vdev_disk_io_start(zio_t *zio)
 		return ZIO_PIPELINE_CONTINUE;
 	}
 
-	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
-	    zio->io_size, zio->io_offset, flags);
+	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) {
+		error = vdev_disk_io_trim(vd->vd_bdev, zio);
+		if (error == ENOTSUP)
+			v->vdev_notrim = B_TRUE;
+	} else {
+		error = __vdev_disk_physio(vd->vd_bdev, zio,
+		    zio->io_data, zio->io_size,
+		    zio->io_offset, flags);
+	}
+
 	if (error) {
 		zio->io_error = error;
 		return ZIO_PIPELINE_CONTINUE;
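Aside (not part of the patch): the splitting loop in vdev_disk_io_trim() caps each bio at max_discard_sectors and rounds that cap down to the device's discard granularity. The same arithmetic in isolation (the sample queue limits below are made up for illustration):

	/* Chunking arithmetic from vdev_disk_io_trim(), stand-alone. */
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t offset = 1 << 20;			/* 1 MiB */
		uint64_t size = 10 << 20;			/* 10 MiB */
		unsigned int max_discard_sectors = 2048;	/* sectors */
		unsigned int discard_granularity = 4096;	/* bytes */
		uint64_t sector, count, chunk;

		/* Align the per-bio cap down to the discard granularity. */
		if (discard_granularity)
			max_discard_sectors &= ~((discard_granularity >> 9) - 1);

		sector = offset >> 9;
		count = size >> 9;
		while (count > 0) {		/* prints ten 2048-sector bios */
			chunk = (count > max_discard_sectors) ?
			    max_discard_sectors : count;
			printf("bio: sector %llu, %llu sectors\n",
			    (unsigned long long)sector,
			    (unsigned long long)chunk);
			sector += chunk;
			count -= chunk;
		}
		return (0);
	}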
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 7ac23500f7f9..16071520b2d9 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -141,6 +141,7 @@
 #include
 #include
 #include
+#include <sys/trim_map.h>
 #include
 
 /*
@@ -1221,5 +1222,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
 	 * to disk to ensure that all odd-label updates are committed to
 	 * stable storage before the next transaction group begins.
 	 */
-	return (vdev_label_sync_list(spa, 1, txg, flags));
+	if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+		return (error);
+
+	trim_thread_wakeup(spa);
+
+	return (0);
 }
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 96623d2cf5e2..33d148125cd5 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -290,10 +290,11 @@ vdev_mirror_io_start(zio_t *zio)
 		c = vdev_mirror_child_select(zio);
 		children = (c >= 0);
 	} else {
-		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+		    zio->io_type == ZIO_TYPE_FREE);
 
 		/*
-		 * Writes go to all children.
+		 * Writes and frees go to all children.
 		 */
 		c = 0;
 		children = mm->mm_children;
@@ -374,6 +375,8 @@ vdev_mirror_io_done(zio_t *zio)
 			zio->io_error = vdev_mirror_worst_error(mm);
 		}
 		return;
+	} else if (zio->io_type == ZIO_TYPE_FREE) {
+		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 21c6e032bc19..99aa0fd7ed2c 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -503,14 +503,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
-	for (c = 0; c < rm->rm_firstdatacol; c++)
-		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+	if (zio->io_type != ZIO_TYPE_FREE) {
+		for (c = 0; c < rm->rm_firstdatacol; c++) {
+			rm->rm_col[c].rc_data =
+			    zio_buf_alloc(rm->rm_col[c].rc_size);
+		}
 
-	rm->rm_col[c].rc_data = zio->io_data;
+		rm->rm_col[c].rc_data = zio->io_data;
 
-	for (c = c + 1; c < acols; c++)
-		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
-		    rm->rm_col[c - 1].rc_size;
+		for (c = c + 1; c < acols; c++) {
+			rm->rm_col[c].rc_data =
+			    (char *)rm->rm_col[c - 1].rc_data +
+			    rm->rm_col[c - 1].rc_size;
+		}
+	}
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
@@ -1531,6 +1537,18 @@ vdev_raidz_io_start(zio_t *zio)
 
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
+	if (zio->io_type == ZIO_TYPE_FREE) {
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_devidx];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, 0,
+			    vdev_raidz_child_done, rc));
+		}
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_generate_parity(rm);
 
@@ -1915,6 +1933,8 @@ vdev_raidz_io_done(zio_t *zio)
 		if (total_errors > rm->rm_firstdatacol)
 			zio->io_error = vdev_raidz_worst_error(rm);
 
+		return;
+	} else if (zio->io_type == ZIO_TYPE_FREE) {
 		return;
 	}
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index fe2bdc867fea..420d129ebbad 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include <sys/trim_map.h>
 
 /*
  * ==========================================================================
@@ -55,6 +56,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
 	10,	/* ZIO_PRIORITY_RESILVER	*/
 	20,	/* ZIO_PRIORITY_SCRUB		*/
 	2,	/* ZIO_PRIORITY_DDT_PREFETCH	*/
+	30,	/* ZIO_PRIORITY_TRIM		*/
 };
 
 /*
@@ -553,7 +555,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
 	zio_t *zio;
 
-	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT(type == ZIO_TYPE_FREE || size <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
@@ -803,15 +805,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 }
 
 zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int priority,
+    enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
 
 	if (vd->vdev_children == 0) {
-		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
+		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 
 		zio->io_cmd = cmd;
@@ -820,7 +823,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-			    done, private, priority, flags));
+			    offset, size, done, private, priority, flags));
 	}
 
 	return (zio);
@@ -945,11 +948,22 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
-	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
 	    NULL, NULL, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
+void
+zio_trim(zio_t *zio, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCTRIM, offset, size,
+	    NULL, NULL, ZIO_PRIORITY_TRIM,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
+
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
@@ -2409,6 +2423,12 @@ zio_vdev_io_start(zio_t *zio)
 		return (vdev_mirror_ops.vdev_op_io_start(zio));
 	}
 
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+		if (!BP_IS_GANG(zio->io_bp))
+			trim_map_free(zio);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
 	/*
 	 * We keep track of time-sensitive I/Os so that the scan thread
 	 * can quickly react to certain workloads. In particular, we care
@@ -2444,7 +2464,7 @@ zio_vdev_io_start(zio_t *zio)
 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
 	ASSERT(P2PHASE(zio->io_size, align) == 0);
 
-	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
@@ -2484,6 +2504,11 @@ zio_vdev_io_start(zio_t *zio)
 		}
 	}
 
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) {
+		if (!trim_map_write_start(zio))
+			return (ZIO_PIPELINE_STOP);
+	}
+
 	return (vd->vdev_ops->vdev_op_io_start(zio));
 }
 
@@ -2497,9 +2522,16 @@ zio_vdev_io_done(zio_t *zio)
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
+
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    zio->io_type == ZIO_TYPE_WRITE) {
+		trim_map_write_done(zio);
+	}
 
-	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
 
 		vdev_queue_io_done(zio);